/* **********************************************************
 * Copyright (c) 2010-2014 Google, Inc.  All rights reserved.
 * Copyright (c) 2000-2010 VMware, Inc.  All rights reserved.
 * **********************************************************/

/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the name of VMware, Inc. nor the names of its contributors may be
 *   used to endorse or promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */

/* file "emit_utils_shared.c" */
/* The Pentium processors maintain cache consistency in hardware, so we don't
 * worry about getting stale cache entries.
 */
/* FIXME i#1551: flush code cache after update it on ARM because the hardware
 * does not maintain cache consistency in hardware.
 */

#include "../globals.h"
#include "../link.h"
#include "../fragment.h"
#include "../fcache.h"
#include "../emit.h"

#include "arch.h"
#include "instr.h"
#include "instr_create.h"
#include "instrlist.h"
#include "instrument.h" /* for dr_insert_call() */
#include "proc.h"
#include <string.h> /* for memcpy */
#include "decode.h"
#include "decode_fast.h"
#include "x86/decode_private.h"
#ifdef DEBUG
# include "disassemble.h"
#endif
#include <limits.h> /* for UCHAR_MAX */
#include "../perscache.h"

#ifdef VMX86_SERVER
# include "vmkuw.h"
#endif

/* fragment_t fields */
/* CAUTION: if TAG_OFFS changes from 0, must change indirect exit stub! */
#define FRAGMENT_START_PC_OFFS   (offsetof(fragment_t, start_pc))
#define FRAGMENT_COUNTER_OFFS    (offsetof(fragment_t, hot_counter))
#define FRAGMENT_PREFIX_SIZE_OFFS (offsetof(fragment_t, prefix_size))

#ifdef TRACE_HEAD_CACHE_INCR
/* linkstub_t field */
#  define LINKSTUB_TARGET_FRAG_OFFS  (offsetof(direct_linkstub_t, target_fragment))
#endif

#ifdef PROFILE_LINKCOUNT
#  define LINKSTUB_COUNT_OFFS        (offsetof(linkstub_t, count))
#endif


/* N.B.: I decided to not keep supporting DCONTEXT_IN_EDI
 * If we really want it later we can add it, it's a pain to keep
 * maintaining it with every change here
 */
#ifdef DCONTEXT_IN_EDI
# error DCONTEXT_IN_EDI Not Implemented
#endif

/* make code more readable by shortening long lines
 * we mark all as meta to avoid client interface asserts
 */
#define POST instrlist_meta_postinsert
#define PRE  instrlist_meta_preinsert
#define APP  instrlist_meta_append

/**
 ** CAUTION!
 **
 ** The following definitions and routines are highly dependent upon
 ** definitions made in x86.asm.  Do NOT change any constants or code
 ** without first consulting that file.
 **
 **/

/***************************************************************************
 ***************************************************************************
 ** EXIT STUB
 **
 ** WARNING: all exit stubs must support atomic linking and unlinking,
 ** meaning a link/unlink operation must involve a single store!
 ** There is an exception: a first-time link (detected using a sentinel
 ** LINKCOUNT_NEVER_LINKED_SENTINEL placed where the unlinked entry
 ** code will go once linked) does not need to be atomic.
 **/

/* FIXME i#1551: update the comment to x86/arm in this file */
/*
direct branch exit_stub:
   5x8  mov   %xax, xax_offs(&dcontext) or tls
  #if defined(PROFILE_LINKCOUNT)  (PR 248210: x64 not supported)
  | 1   lahf
  | 3   seto  %al
  |#if !defined(LINKCOUNT_64_BITS)
  | 6   inc   l->count
  |#else
  | 7   add   $1,l->count
  | 7   adc   $0,l->count+4
  |#endif
  | 2   add   $0x7f,%al
  | 1   sahf
  #endif
   5x10 mov   &linkstub, %xax
    5   jmp   target addr
#if defined(PROFILE_LINKCOUNT)  (PR 248210: x64 not supported)
|unlinked entry point:
|    5   movl  %eax, eax_offs(&dcontext)
|    5   movl  &linkstub, %eax
|    5   jmp   fcache_return
|
|  Notes: we link/unlink by modifying the 1st jmp to either target unlinked
|  entry point or the target fragment.  When we link for the first time
|  we try to remove the eflags save/restore, shifting the 1st jmp up (the
|  space between it and unlinked entry just becomes junk).
#endif

indirect branch exit_stub (only used if -indirect_stubs):
   6x9  mov   %xbx, xbx_offs(&dcontext) or tls
   5x11 mov   &linkstub, %xbx
    5   jmp   indirect_branch_lookup

indirect branches use xbx so that the flags can be saved into xax using
the lahf instruction!
xref PR 249775 on lahf support on x64.

for PROFILE_LINKCOUNT, the count increment is performed inside the
hashtable lookup (in both linked and unlinked paths) both since the flags
are saved there for the linked path and to save space in stubs

also see emit_inline_ibl_stub() below

*/

/* DIRECT_EXIT_STUB_SIZE is in arch_exports.h */
#define STUB_DIRECT_SIZE(flags) DIRECT_EXIT_STUB_SIZE(flags)

/* for -thread_private, we're relying on the fact that
 * SIZE32_MOV_XBX_TO_TLS == SIZE32_MOV_XBX_TO_ABS, and that
 * x64 always uses tls
 */
#define STUB_INDIRECT_SIZE32 \
    (SIZE32_MOV_XBX_TO_TLS + SIZE32_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH)
#define STUB_INDIRECT_SIZE64 \
    (SIZE64_MOV_XBX_TO_TLS + SIZE64_MOV_PTR_IMM_TO_XAX + JMP_LONG_LENGTH)
#define STUB_INDIRECT_SIZE(flags) \
    (FRAG_IS_32(flags) ? STUB_INDIRECT_SIZE32 : STUB_INDIRECT_SIZE64)

/* STUB_COARSE_DIRECT_SIZE is in arch_exports.h */
#define STUB_COARSE_INDIRECT_SIZE(flags) (STUB_INDIRECT_SIZE(flags))

#ifndef LINKCOUNT_64_BITS
#  define LINKCOUNT_INCSIZE (6)
#else
#  define LINKCOUNT_INCSIZE (7+7)
#endif
#define LINKCOUNT_EFLAGS_SAVE (3+1)
#define LINKCOUNT_EFLAGS_RESTORE (2+1)
#define LINKCOUNT_FLAGSIZE (LINKCOUNT_EFLAGS_SAVE + LINKCOUNT_EFLAGS_RESTORE)

#define LINKCOUNT_DIRECT_EXTRA(flags) \
    (LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(flags))
#define LINKCOUNT_UNLINKED_ENTRY(flags) \
    (LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(flags))

/* used to distinguish a never-linked direct exit -- once linked this
 * will be replaced by the beginning of the unlink entry point, which is
 * a save of xax, which will never look like this.  we choose nops to
 * avoid complicating our disassembly routines.
 */
#define LINKCOUNT_NEVER_LINKED_SENTINEL 0x90909090

/* Return size in bytes required for an exit stub with specified
 * target and FRAG_ flags
 */
int
exit_stub_size(dcontext_t *dcontext, cache_pc target, uint flags)
{
    if (TEST(FRAG_COARSE_GRAIN, flags)) {
        /* For coarse: bb building points at bb ibl, and then insert_exit_stub
         * changes that to the appropriate coarse prefix.  So the emit() calls to
         * this routine pass in a real ibl.  But any later calls, e.g. for
         * disassembly, that ask linkstub_size() will call EXIT_TARGET_TAG() which
         * calls indirect_linkstub_target() which returns get_coarse_ibl_prefix():
         * which then is not recognized as indirect by this routine!
         * Note that coarse_indirect_stub_jmp_target() derefs the prefix:
         * should we require callers who have stub pc to call that instead of us
         * de-referencing?
         */
        target = coarse_deref_ibl_prefix(dcontext, target);
    }
    if (is_indirect_branch_lookup_routine(dcontext, target)) {
        /* indirect branch */

        /* FIXME: Since we don't have the stub flags we'll lookup the
         * target routine's template in a very roundabout fashion here
         * by dispatching on the ibl_routine entry point
         */
        ibl_code_t *ibl_code;

        ibl_type_t ibl_type;
        IF_X64(gencode_mode_t mode;)
        DEBUG_DECLARE(bool is_ibl = )
            get_ibl_routine_type_ex(dcontext, target, &ibl_type _IF_X64(&mode));
        ASSERT(is_ibl);
        IF_X64(ASSERT(mode == FRAGMENT_GENCODE_MODE(flags) ||
                      (DYNAMO_OPTION(x86_to_x64) && mode == GENCODE_X86_TO_X64)));
        ibl_code = get_ibl_routine_code_ex(dcontext, ibl_type.branch_type, flags
                                           _IF_X64(mode));

        if (!EXIT_HAS_STUB(ibltype_to_linktype(ibl_code->branch_type),
                           IBL_FRAG_FLAGS(ibl_code)))
            return 0;

        if (TEST(FRAG_COARSE_GRAIN, flags)) {
            IF_WINDOWS(ASSERT(!is_shared_syscall_routine(dcontext, target)));
            /* keep in synch w/ coarse_indirect_stub_size() */
            return (STUB_COARSE_INDIRECT_SIZE(flags));
        }

#ifdef WINDOWS
        if (is_shared_syscall_routine(dcontext, target)) {
            return INTERNAL_OPTION(shared_syscalls_fastpath) ? 5 :
                STUB_INDIRECT_SIZE(flags);
        }
#endif

        if (ibl_code->ibl_head_is_inlined)
            return ibl_code->inline_stub_length;
        else
            return (STUB_INDIRECT_SIZE(flags));
    } else {
        /* direct branch */
        if (TEST(FRAG_COARSE_GRAIN, flags))
            return (STUB_COARSE_DIRECT_SIZE(flags));
#ifdef PROFILE_LINKCOUNT
        if (dynamo_options.profile_counts && (flags & FRAG_IS_TRACE) != 0)
            return (STUB_DIRECT_SIZE(flags) + LINKCOUNT_DIRECT_EXTRA(flags));
        else {
#endif
            return (STUB_DIRECT_SIZE(flags));
#ifdef PROFILE_LINKCOUNT
        }
#endif
    }
}

static bool
is_patchable_exit_stub_helper(dcontext_t *dcontext, cache_pc ltarget,
                              ushort lflags, uint fflags)
{
    if (LINKSTUB_INDIRECT(lflags)) {
        /*indirect */
        if (!DYNAMO_OPTION(indirect_stubs))
            return false;
        if (
#ifdef WINDOWS
            !is_shared_syscall_routine(dcontext, ltarget) &&
#endif
            get_ibl_routine_code(dcontext, extract_branchtype(lflags), fflags)
                ->ibl_head_is_inlined) {
            return !DYNAMO_OPTION(atomic_inlined_linking);
        } else {
            return true;
        }
    } else {
        /* direct */
        ASSERT(LINKSTUB_DIRECT(lflags));
#if defined(PROFILE_LINKCOUNT) || defined(TRACE_HEAD_CACHE_INCR)
        return true;
#else
        return false;
#endif
    }
}

bool
is_patchable_exit_stub(dcontext_t *dcontext, linkstub_t *l, fragment_t *f)
{
    return is_patchable_exit_stub_helper(dcontext, EXIT_TARGET_TAG(dcontext, f, l),
                                         l->flags, f->flags);
}

bool
is_exit_cti_stub_patchable(dcontext_t *dcontext, instr_t *inst, uint frag_flags)
{
    app_pc target;
    /* we figure out what the linkstub flags should be
     * N.B.: we have to be careful to match the LINKSTUB_ macros
     */
    ushort lflags = (ushort) instr_exit_branch_type(inst);
    ASSERT_TRUNCATE(lflags, ushort, instr_exit_branch_type(inst));
    ASSERT(instr_is_exit_cti(inst));
    target = instr_get_branch_target_pc(inst);
    if (is_indirect_branch_lookup_routine(dcontext, target)) {
        lflags |= LINK_INDIRECT;
    } else {
        lflags |= LINK_DIRECT;
    }
    return is_patchable_exit_stub_helper(dcontext, target, lflags, frag_flags);
}

uint
bytes_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l,
                             fragment_t *f, byte *startpc)
{
    if (is_patchable_exit_stub(dcontext, l, f)) {
        /* assumption - we only hot patch the ending jmp of the exit stub
         * (and that exit stub size returns the right values) */
        ptr_uint_t shift = ALIGN_SHIFT_SIZE
            (startpc +
             exit_stub_size(dcontext, EXIT_TARGET_TAG(dcontext, f, l), f->flags) -
             EXIT_STUB_PATCH_OFFSET,
             EXIT_STUB_PATCH_SIZE, PAD_JMPS_ALIGNMENT);
#ifdef PROFILE_LINKCOUNT
        /* assumption doesn't hold because of the optimize ... */
        /* FIXME : once this is implemented re-enable the ifdefed out stats
         * in emit_fragment_common */
        ASSERT_NOT_IMPLEMENTED(false);
#endif
        IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(shift)));
        return (uint) shift;
    }
    return 0;
}

/* Returns an upper bound on the number of bytes that will be needed to add
 * this fragment to a trace */
uint
extend_trace_pad_bytes(fragment_t *add_frag)
{
    /* FIXME : this is a poor estimate, we could do better by looking at the
     * linkstubs and checking if we are inlining ibl, but since this is just
     * used by monitor.c for a max size check should be fine to overestimate
     * we'll just end up with slightly shorter max size traces */
    /* we don't trace through traces in normal builds, so don't worry about
     * number of exits (FIXME this also assumes bbs don't trace through
     * conditional or indirect branches) */
    ASSERT_NOT_IMPLEMENTED(!TEST(FRAG_IS_TRACE, add_frag->flags));
    /* Also, if -pad_jmps_shift_bb we assume that we don't need to remove
     * any nops from fragments added to traces since there shouldn't be any if
     * we only add bbs (nop_pad_ilist has an assert that verifies we don't add
     * any nops to bbs when -pad_jmps_shift_bb without marking as CANNOT_BE_TRACE,
     * so here we also verify that we only add bbs) - Xref PR 215179, UNIX syscall
     * fence exits and CLIENT_INTERFACE added/moved exits can lead to bbs with
     * additional hot_patchable locations.  We mark such bb fragments as CANNOT_BE_TRACE
     * in nop_pad_ilist() if -pad_jmps_mark_no_trace is set or assert otherwise to avoid
     * various difficulties so should not see them here. */
    /* A standard bb has at most 2 patchable locations (ends in conditional or ends
     * in indirect that is promoted to inlined). */
    return 2*MAX_PAD_SIZE;
}

/* return startpc shifted by the necessary bytes to pad patchable jmps of the
 * exit stub to proper alignment */
byte *
pad_for_exitstub_alignment(dcontext_t *dcontext, linkstub_t *l,
                           fragment_t *f, byte *startpc)
{
    uint shift;
    ASSERT(PAD_FRAGMENT_JMPS(f->flags)); /* shouldn't call this otherwise */

    shift = bytes_for_exitstub_alignment(dcontext, l, f, startpc);
    if (shift > 0) {
        /* Pad with 1 byte instructions so looks nice in debuggers.
         * decode_fragment also checks for this as a sanity check.  Note,
         * while these instructions can never be reached, they will be decoded
         * by shift fcache pointers so must put something valid here. */
        SET_TO_DEBUG(startpc, shift);
        startpc += shift;
        STATS_PAD_JMPS_ADD(f->flags, num_shifted_stubs, 1);
        STATS_PAD_JMPS_ADD(f->flags, shifted_stub_bytes, shift);
    } else {
        STATS_PAD_JMPS_ADD(f->flags, num_stubs_no_shift, 1);
    }
    return startpc;
}

/* Only used if -no_pad_jmps_shift_{bb,trace}. FIXME this routine is expensive (the
 * instr_expand) and we may end up removing app nops (an optimizations but
 * not really what we're after here). */
void
remove_nops_from_ilist(dcontext_t *dcontext, instrlist_t *ilist
                       _IF_DEBUG(bool recreating))
{
    instr_t *inst, *next_inst;

    for (inst = instrlist_first(ilist); inst != NULL; inst = next_inst) {
        /* FIXME : expensive, just expand instr before cti, function not used
         * if -no_pad_jmps_shift_{bb,trace} */
        inst = instr_expand(dcontext, ilist, inst);
        next_inst = instr_get_next(inst);
        if (instr_is_nop(inst)) {
            instrlist_remove(ilist, inst);
            DOSTATS({
                if (!recreating) {
                    STATS_INC(num_nops_removed);
                    STATS_ADD(num_nop_bytes_removed, instr_length(dcontext, inst));
                }
            });
            instr_destroy(dcontext, inst);
        }
    }
}

cache_pc
get_direct_exit_target(dcontext_t *dcontext, uint flags)
{
    if (FRAG_DB_SHARED(flags)) {
        if (TEST(FRAG_COARSE_GRAIN, flags)) {
            /* note that entrance stubs should target their unit's prefix,
             * who will then target this routine
             */
            return fcache_return_coarse_routine(IF_X64(FRAGMENT_GENCODE_MODE(flags)));
        } else
            return fcache_return_shared_routine(IF_X64(FRAGMENT_GENCODE_MODE(flags)));
    } else {
        return fcache_return_routine_ex(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(flags)));
    }
}

int
insert_exit_stub(dcontext_t *dcontext, fragment_t *f,
                 linkstub_t *l, cache_pc stub_pc)
{
    return insert_exit_stub_other_flags(dcontext, f, l, stub_pc, l->flags);
}

/* Patch the (direct) branch at branch_pc so it branches to target_pc
 * The write that actually patches the branch is done atomically so this
 * function is safe with respect to a thread executing this branch presuming
 * that both the before and after targets are valid and that [pc, pc+4) does
 * not cross a cache line.
 */
void
patch_branch(cache_pc branch_pc, cache_pc target_pc, bool hot_patch)
{
    cache_pc byte_ptr = exit_cti_disp_pc(branch_pc);
    insert_relative_target(byte_ptr, target_pc, hot_patch);
}

#ifdef PROFILE_LINKCOUNT
static byte *
change_linkcount_target(byte *pc, app_pc target)
{
    /* Once we've linked once, we modify the jmp at the end of the
     * link code in the stub to either jmp to the unlinked entry
     * (which has no counter inc code of its own, that's why the exit
     * jmp doesn't go straight there) or to the target.
     * To find the jmp, watch first opcode to determine which state
     * stub is in (depending on whether had to save eflags or not).
     */
    if (*pc == 0xff || *pc == 0x83) { /* inc/add is 1st instr */
        pc += LINKCOUNT_INCSIZE + 1;
    } else {
        IF_X64(ASSERT_NOT_IMPLEMENTED(false)); /* need to pass in flags */
        pc += LINKCOUNT_INCSIZE + LINKCOUNT_FLAGSIZE + STUB_DIRECT_SIZE(FRAG_32_BIT) - 4;
    }
    pc = insert_relative_target(pc, target, HOT_PATCHABLE);
    return pc;
}

static void
optimize_linkcount_stub(dcontext_t *dcontext, fragment_t *f,
                        linkstub_t *l, fragment_t *targetf)
{
    /* first-time link: try to remove eflags save/restore */
# ifdef CUSTOM_EXIT_STUBS
    byte *stub_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l);
# else
    byte *stub_pc = (byte *) EXIT_STUB_PC(dcontext, f, l);
# endif
    byte *pc = stub_pc;
    bool remove_eflags_save = false;
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_DIRECT(l->flags));

    if (!INTERNAL_OPTION(unsafe_ignore_eflags_prefix)) {
        remove_eflags_save = TEST(FRAG_WRITES_EFLAGS_6, targetf->flags);
    }
    else {
        /* scan through code at target fragment, stop scanning at 1st branch */
        uint eflags = 0;
        cache_pc end_pc = EXIT_CTI_PC(f, FRAGMENT_EXIT_STUBS(targetf));
        byte *fpc = (byte *) FCACHE_ENTRY_PC(targetf);
        /* for simplicity, stop at first instr that touches the flags */
        while (eflags == 0 && fpc != NULL && ((cache_pc)fpc) < end_pc) {
            fpc = decode_eflags_usage(dcontext, fpc, &eflags);
        }
        remove_eflags_save =
            (eflags & (EFLAGS_WRITE_6|EFLAGS_READ_6)) == EFLAGS_WRITE_6;
    }
    if (remove_eflags_save) {
        /* the 6 flags modified by add and adc are written before
         * they're read -> don't need to save eflags!
         *
         * I tried replacing lahf & sahf w/ nops, it's noticeably
         * faster to not have the nops, so redo the increment:
         */
        pc = insert_linkcount_inc(pc, l);
        pc = insert_relative_jump(pc, FCACHE_ENTRY_PC(targetf),
                                  NOT_HOT_PATCHABLE);
        /* Fill out with nops till the unlinked entry point so disassembles
         * nicely for logfile (we're profile linkcount so presumably going
         * to dump this). */
        while (pc < (stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) {
            *pc = 0x90; pc++;  /* nop */
        }
    } else {
        /* keep eflags save & restore -- need to keep save of eax
         * so skip all that now, go to right before store of &l into eax
         */
        pc += LINKCOUNT_DIRECT_EXTRA(f->flags) - 5 - 5;
        /* need to insert a restore of eax -- luckily it perfectly
         * overwrites the store of &l into eax, FIXME - dangerous
         * though, if we ever drop the addr16 flag on a shared restore the
         * instruction will be 6 bytes and our hardcoded 5 above will
         * lead to a crash (should trigger assert below at least).
         */
        pc = insert_restore_xax(dcontext, pc, f->flags, FRAG_DB_SHARED(f->flags),
                                DIRECT_STUB_SPILL_SLOT, true);
        ASSERT(pc == stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags) - 5);
        /* now add jmp */
        pc = insert_relative_jump(pc, FCACHE_ENTRY_PC(targetf),
                                  NOT_HOT_PATCHABLE);
    }

    /* we need to replace our never-linked sentinel w/ the real
     * unlinked entry point.
     */
    ASSERT(pc == stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags));
    pc = stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags);
    ASSERT(*((uint *)pc) == LINKCOUNT_NEVER_LINKED_SENTINEL);
    pc = insert_save_xax(dcontext, pc, f->flags, FRAG_DB_SHARED(f->flags),
                         DIRECT_STUB_SPILL_SLOT, true);
    /* mov $linkstub_ptr,%xax */
    *pc = MOV_IMM2XAX_OPCODE; pc++;
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));
    *((uint *)pc) = (uint)l; pc += 4;
    /* jmp to target */
    pc = insert_relative_jump(pc, get_direct_exit_target(dcontext, f->flags),
                              NOT_HOT_PATCHABLE);
}
#endif /* PROFILE_LINKCOUNT */

/* Checks patchable exit cti for proper alignment for patching. If it's
 * properly aligned returns 0, else returns the number of bytes it would
 * need to be forward shifted to be properly aligned */
uint
patchable_exit_cti_align_offs(dcontext_t *dcontext, instr_t *inst, cache_pc pc)
{
    /* all our exit cti's currently use 4 byte offsets */
    /* FIXME : would be better to use a instr_is_cti_long or some such
     * also should check for addr16 flag (we shouldn't have any prefixes) */
    ASSERT((instr_is_cti(inst) && !instr_is_cti_short(inst) &&
            !TESTANY(~(PREFIX_JCC_TAKEN|PREFIX_JCC_NOT_TAKEN), instr_get_prefixes(inst)))
            || instr_is_cti_short_rewrite(inst, NULL));
    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint
                  (ALIGN_SHIFT_SIZE(pc + instr_length(dcontext, inst) - CTI_PATCH_SIZE,
                                    CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT))));
    return (uint) ALIGN_SHIFT_SIZE(pc + instr_length(dcontext, inst) - CTI_PATCH_SIZE,
                                   CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT);
}

/* Returns true if the exit cti is ever dynamically modified */
bool
is_exit_cti_patchable(dcontext_t *dcontext, instr_t *inst, uint frag_flags)
{
    app_pc target;
    if (TEST(FRAG_COARSE_GRAIN, frag_flags)) {
        /* Case 8647: coarse grain fragment bodies always link through stubs
         * until frozen, so their ctis are never patched except at freeze time
         * when we suspend the world.
         */
        ASSERT(!TEST(FRAG_IS_TRACE, frag_flags));
        return false;
    }
    ASSERT(instr_is_exit_cti(inst));
    target = instr_get_branch_target_pc(inst);
    if (is_indirect_branch_lookup_routine(dcontext, target)) {
        /* whether has an inline stub or not, cti is always
         * patched if -no_indirect_stubs
         */
        if (!DYNAMO_OPTION(indirect_stubs))
            return true;
#ifdef WINDOWS
        if (target != shared_syscall_routine(dcontext)) {
#endif
            return get_ibl_routine_code(dcontext,
                     extract_branchtype((ushort)instr_exit_branch_type(inst)),
                                        frag_flags)->ibl_head_is_inlined;
#ifdef WINDOWS
        }
        return false;
#endif
    } else {
        /* direct exit */
#ifdef PROFILE_LINKCOUNT
        if (DYNAMO_OPTION(profile_counts) && TEST(FRAG_IS_TRACE, frag_flags)) {
# ifdef CUSTOM_EXIT_STUBS
            return true;
# else
            return false;
# endif
        }
#endif
        if (instr_branch_special_exit(inst))
            return false;
        return true;
    }
}

/* returns true if exit cti no longer points at stub
 * (certain situations, like profiling or TRACE_HEAD_CACHE_INCR, go
 * through the stub even when linked)
 */
bool
link_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l, fragment_t *targetf,
                 bool hot_patch)
{
#if defined(PROFILE_LINKCOUNT) || defined(TRACE_HEAD_CACHE_INCR)
# ifdef CUSTOM_EXIT_STUBS
    byte *stub_pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l));
# else
    byte *stub_pc = (byte *) (EXIT_STUB_PC(dcontext, f, l));
# endif
#endif
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_DIRECT(l->flags));
    STATS_INC(num_direct_links);

#ifdef PROFILE_LINKCOUNT
    if (dynamo_options.profile_counts && TEST(FRAG_IS_TRACE, f->flags)) {
        /* do not change the exit jmp, instead change the stub itself */
        if (*((uint *)(stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) ==
            LINKCOUNT_NEVER_LINKED_SENTINEL) {
            /* this is not atomic, but that's ok, it's first-time only */
            /* FIXME - this assumption is so not safe with shared cache
             * since we add to table and link incoming before linking outgoing
             */
            optimize_linkcount_stub(dcontext, f, l, targetf);
# ifdef CUSTOM_EXIT_STUBS
            /* FIXME: want flag that says whether should go through custom
             * only when unlinked, or always!
             * For now we assume only when unlinked:
             */
            /* skip custom code */
            patch_branch(EXIT_CTI_PC(f, l), stub_pc,
                         TEST(FRAG_SHARED, f->flags) ? hot_patch : NOT_HOT_PATCHABLE);
# endif
        } else {
# ifdef CUSTOM_EXIT_STUBS
            /* FIXME: want flag that says whether should go through custom
             * only when unlinked, or always!
             * For now we assume only when unlinked:
             */
            /* skip custom code */
            patch_branch(EXIT_CTI_PC(f, l), stub_pc, hot_patch);
# endif
            change_linkcount_target(stub_pc, FCACHE_ENTRY_PC(targetf));
        }
# ifdef TRACE_HEAD_CACHE_INCR
        /* yes, we wait for linkcount to do its thing and then we change it --
         * but to make it more efficient will make this already ungainly
         * code even harder to read
         */
        /* FIXME - atomicity issues? */
        if ((targetf->flags & FRAG_IS_TRACE_HEAD) != 0) {
            /* after optimized inc, jmp to unlinked code, but change its final
             * jmp to go to incr routine
             */
            change_linkcount_target(stub_pc, stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags));
            LOG(THREAD, LOG_LINKS, 4,
                "\tlinking F%d."PFX" to incr routine b/c F%d is trace head\n",
                f->id, EXIT_CTI_PC(f, l), targetf->id);
            patch_branch(stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags) + 10,
                         trace_head_incr_routine(dcontext), hot_patch);
        }
# endif
        return false; /* going through stub */
    }
#endif /* PROFILE_LINKCOUNT */

#ifdef TRACE_HEAD_CACHE_INCR
    if ((targetf->flags & FRAG_IS_TRACE_HEAD) != 0) {
        LOG(THREAD, LOG_LINKS, 4,
            "\tlinking F%d."PFX" to incr routine b/c F%d is trace head\n",
            f->id, EXIT_CTI_PC(f, l), targetf->id);
        /* FIXME: more efficient way than multiple calls to get size-5? */
        ASSERT(linkstub_size(dcontext, f, l) == DIRECT_EXIT_STUB_SIZE(f->flags));
        patch_branch(stub_pc + DIRECT_EXIT_STUB_SIZE(f->flags) - 5,
                     trace_head_incr_routine(dcontext), hot_patch);
        return false; /* going through stub */
    }
#endif

    /* change jmp target to point to the passed-in target */
#ifdef UNSUPPORTED_API
    if ((l->flags & LINK_TARGET_PREFIX) != 0) {
        /* want to target just the xcx restore, not the eflags restore
         * (only ibl targets eflags restore)
         */
        patch_branch(EXIT_CTI_PC(f, l), FCACHE_PREFIX_ENTRY_PC(targetf),
                     hot_patch);
    } else
#endif
        patch_branch(EXIT_CTI_PC(f, l), FCACHE_ENTRY_PC(targetf), hot_patch);
    return true; /* do not need stub anymore */
}

void
unlink_direct_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    cache_pc stub_pc = (cache_pc) EXIT_STUB_PC(dcontext, f, l);
#ifdef TRACE_HEAD_CACHE_INCR
    direct_linkstub_t *dl = (direct_linkstub_t *) l;
#endif
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_DIRECT(l->flags));

#ifdef PROFILE_LINKCOUNT
    if (dynamo_options.profile_counts && TEST(FRAG_IS_TRACE, f->flags)) {
        byte *pc;
        if (*((uint *)(stub_pc + LINKCOUNT_DIRECT_EXTRA(f->flags))) ==
            LINKCOUNT_NEVER_LINKED_SENTINEL) {
            /* never been linked, don't go pointing at the uninitialized
             * unlink entry point -- just return, initial state is fine
             */
            return;
        }
# ifdef CUSTOM_EXIT_STUBS
        pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l));
        stub_pc = (cache_pc) pc;
        /* FIXME: want flag that says whether should go through custom
         * only when unlinked, or always! Also is racy with 2nd branch patch.
         * For now we assume only when unlinked.
         */
        /* go through custom code again */
        patch_branch(EXIT_CTI_PC(f, l), stub_pc, HOT_PATCHABLE);
# else
        pc = (byte *) stub_pc;
# endif
# ifdef TRACE_HEAD_CACHE_INCR
        if (dl->target_fragment != NULL) { /* HACK to tell if targeted trace head */
            /* make unlinked jmp go back to fcache_return */
            patch_branch(pc + LINKCOUNT_UNLINKED_ENTRY(f->flags) + 10,
                         get_direct_exit_target(dcontext, f->flags),
                         HOT_PATCHABLE);
        } else
# endif
            /* make jmp after incr go to unlinked entry */
            change_linkcount_target(pc, stub_pc + LINKCOUNT_UNLINKED_ENTRY(f->flags));
        return;
    }
#endif

#ifdef TRACE_HEAD_CACHE_INCR
    if (dl->target_fragment != NULL) { /* HACK to tell if targeted trace head */
# ifdef CUSTOM_EXIT_STUBS
        byte *pc = (byte *) (EXIT_FIXED_STUB_PC(dcontext, f, l));
# else
        byte *pc = (byte *) (EXIT_STUB_PC(dcontext, f, l));
# endif
        /* FIXME: more efficient way than multiple calls to get size-5? */
        ASSERT(linkstub_size(dcontext, f, l) == DIRECT_EXIT_STUB_SIZE(f->flags));
        patch_branch(pc + DIRECT_EXIT_STUB_SIZE(f->flags) - 5,
                     get_direct_exit_target(dcontext, f->flags),
                     HOT_PATCHABLE);
    }
#endif

    /* change jmp target to point to top of exit stub */
    patch_branch(EXIT_CTI_PC(f, l), stub_pc, HOT_PATCHABLE);
}

/* NOTE : for inlined indirect branches linking is !NOT! atomic with respect
 * to a thread executing in the cache unless using the atomic_inlined_linking
 * option (unlike unlinking)
 */
void
link_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l, bool hot_patch)
{
    app_pc target_tag = EXIT_TARGET_TAG(dcontext, f, l);
    /* w/ indirect exits now having their stub pcs computed based
     * on the cti targets, we must calculate them at a consistent
     * state (we do have multi-stage modifications for inlined stubs)
     */
    byte *stub_pc = (byte *) EXIT_STUB_PC(dcontext, f, l);
#ifdef CUSTOM_EXIT_STUBS
    byte *fixed_stub_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l);
#endif

    ASSERT(!TEST(FRAG_COARSE_GRAIN, f->flags));

    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_INDIRECT(l->flags));
    /* target is always the same, so if it's already linked, this is a nop */
    if ((l->flags & LINK_LINKED) != 0) {
        STATS_INC(num_indirect_already_linked);
        return;
    }
    STATS_INC(num_indirect_links);

# ifdef WINDOWS
    if (!is_shared_syscall_routine(dcontext, target_tag))
# endif
    {
        ibl_code_t *ibl_code =
            get_ibl_routine_code(dcontext,
                                 extract_branchtype(l->flags), f->flags);

        if (ibl_code->ibl_head_is_inlined) {
            /* need to make branch target the top of the exit stub */
            patch_branch(EXIT_CTI_PC(f, l), stub_pc, hot_patch);
            if (DYNAMO_OPTION(atomic_inlined_linking)) {
                return;
            }
        }
    }

    link_indirect_exit_arch(dcontext, f, l, hot_patch, target_tag);
}

int
linkstub_unlink_entry_offset(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    ibl_code_t *ibl_code;
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    if (!LINKSTUB_INDIRECT(l->flags))
        return 0;
#ifdef WINDOWS
    if (is_shared_syscall_routine(dcontext, EXIT_TARGET_TAG(dcontext, f, l)))
        return 0;
#endif
    ibl_code = get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags);
    if (ibl_code->ibl_head_is_inlined)
        return ibl_code->inline_unlink_offs;
    else
        return 0;
}

cache_pc
indirect_linkstub_target(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    ASSERT(LINKSTUB_INDIRECT(l->flags));
    ASSERT(!TESTANY(LINK_NI_SYSCALL_ALL, l->flags));
#ifdef WINDOWS
    if (EXIT_TARGETS_SHARED_SYSCALL(l->flags)) {
        /* currently this is the only way to distinguish shared_syscall
         * exit from other indirect exits and from other exits in
         * a fragment containing ignorable or non-ignorable syscalls
         */
        ASSERT(TEST(FRAG_HAS_SYSCALL, f->flags));
        return shared_syscall_routine_ex(dcontext
                                         _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)));
    }
#endif
    if (TEST(FRAG_COARSE_GRAIN, f->flags)) {
        /* Need to target the ibl prefix.  Passing in cti works as well as stub,
         * and avoids a circular dependence where linkstub_unlink_entry_offset()
         * call this routine to get the target and then this routine asks for
         * the stub which calls linkstub_unlink_entry_offset()...
         */
        return get_coarse_ibl_prefix(dcontext, EXIT_CTI_PC(f, l),
                                     extract_branchtype(l->flags));
    } else {
        return get_ibl_routine_ex(dcontext, get_ibl_entry_type(l->flags),
                                  get_source_fragment_type(dcontext, f->flags),
                                  extract_branchtype(l->flags)
                                  _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)));
    }
}

/* based on machine state, returns which of cbr l1 and fall-through l2
 * must have been taken
 */
linkstub_t *
linkstub_cbr_disambiguate(dcontext_t *dcontext, fragment_t *f,
                          linkstub_t *l1, linkstub_t *l2)
{
    instr_t instr;
    linkstub_t *taken;
    instr_init(dcontext, &instr);
    decode(dcontext, EXIT_CTI_PC(f, l1), &instr);
    ASSERT(instr_is_cbr(&instr));
    if (instr_cbr_taken(&instr, get_mcontext(dcontext), false/*post-state*/))
        taken = l1;
    else
        taken = l2;
    instr_free(dcontext, &instr);
    return taken;
}


/*******************************************************************************
 * COARSE-GRAIN FRAGMENT SUPPORT
 */


/* FIXME: case 10334: pass in info? */
bool
coarse_is_trace_head(cache_pc stub)
{
    if (coarse_is_entrance_stub(stub)) {
        cache_pc tgt = entrance_stub_jmp_target(stub);
        /* FIXME: could see if tgt is a jmp and deref and cmp to
         * trace_head_return_coarse_routine() to avoid the vmvector
         * lookup required to find the prefix
         */
        return tgt == trace_head_return_coarse_prefix(stub, NULL);
    }
    return false;
}

cache_pc
entrance_stub_jmp_target(cache_pc stub)
{
    cache_pc jmp = entrance_stub_jmp(stub);
    cache_pc tgt;
    ASSERT(jmp != NULL);
    tgt = (cache_pc) PC_RELATIVE_TARGET(jmp+1);
#ifdef X86
    ASSERT(*jmp == JMP_OPCODE);
#elif defined(ARM)
    /* FIXMED i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
#endif /* X86/ARM */
    return tgt;
}

app_pc
entrance_stub_target_tag(cache_pc stub, coarse_info_t *info)
{
    cache_pc jmp = entrance_stub_jmp(stub);
    app_pc tag;
    /* find the immed that is put into tls: at end of pre-jmp instr */
#ifdef X64
    /* To identify whether 32-bit: we could look up the coarse_info_t
     * this is part of but that's expensive so we check whether the
     * tls offset has 2 high byte 0's (we always use addr16 for 32-bit).
     * 32-bit:
     *   67 64 c7 06 e0 0e 02 99 4e 7d  addr16 mov $0x7d4e9902 -> %fs:0x0ee0
     * 64-bit is split into high and low dwords:
     *   65 c7 04 25 20 16 00 00 02 99 4e 7d  mov $0x7d4e9902 -> %gs:0x1620
     *   65 c7 04 25 24 16 00 00 00 00 00 00  mov $0x00000000 -> %gs:0x1624
     * both are followed by a direct jmp.
     */
    if (*((ushort *)(jmp-6)) == 0) { /* 64-bit has 2 0's for high 2 bytes of tls offs */
        ptr_uint_t high32 = (ptr_uint_t) *((uint *)(jmp-4));
        ptr_uint_t low32 = (ptr_uint_t)
            *((uint *)(jmp - (SIZE64_MOV_PTR_IMM_TO_TLS/2) - 4));
        tag = (cache_pc) ((high32 << 32) | low32);
    } else { /* else fall-through to 32-bit case */
#endif
        tag = *((cache_pc *)(jmp-4));
#ifdef X64
    }
#endif
    /* if frozen, this could be a persist-time app pc (i#670).
     * we take in info so we can know mod_shift (we can decode to find it
     * for unlinked but not for linked)
     */
    if (info == NULL)
        info = get_stub_coarse_info(stub);
    if (info->mod_shift != 0 &&
        tag >= info->persist_base &&
        tag < info->persist_base + (info->end_pc - info->base_pc))
        tag -= info->mod_shift;
    return tag;
}

bool
coarse_is_indirect_stub(cache_pc pc)
{
    /* match insert_jmp_to_ibl */
    return instr_raw_is_tls_spill(pc, SCRATCH_REG1/*xbx/r1*/, INDIRECT_STUB_SPILL_SLOT);
}

/* caller should call fragment_coarse_entry_pclookup() ahead of time
 * to avoid deadlock if caller holds info->lock
 */
bool
coarse_cti_is_intra_fragment(dcontext_t *dcontext, coarse_info_t *info,
                             instr_t *inst, cache_pc start_pc)
{
    /* We don't know the size of the fragment but we want to support
     * intra-fragment ctis for clients (i#665) so we use some
     * heuristics.  A real cti is either linked to a target within the
     * same coarse unit (where its target will be an entry point) or
     * points at a stub of some kind (frozen exit prefix or separate
     * entrance stub or inlined indirect stub).
     */
    cache_pc tgt = opnd_get_pc(instr_get_target(inst));
    if (tgt < start_pc ||
        tgt >= start_pc + MAX_FRAGMENT_SIZE ||
        /* if tgt is an entry, then it's a linked exit cti
         * XXX: this may acquire info->lock if it's never been called before
         */
        fragment_coarse_entry_pclookup(dcontext, info, tgt) != NULL ||
        /* these lookups can get expensive but should only hit them
         * when have clients adding intra-fragment ctis.
         * XXX: is there a min distance we could use to rule out
         * being in stubs?  for frozen though prefixes are
         * right after cache.
         */
        coarse_is_indirect_stub(tgt) ||
        in_coarse_stubs(tgt) ||
        in_coarse_stub_prefixes(tgt)) {
        return false;
    } else
        return true;
}

cache_pc
coarse_indirect_stub_jmp_target(cache_pc stub)
{
#ifdef X86
    cache_pc prefix_tgt, tgt;
    cache_pc jmp;
    size_t stub_size;
# ifdef X64
    /* See the stub sequences in entrance_stub_target_tag(): 32-bit always has
     * an addr prefix while 64-bit does not
     */
    /* FIXME: PR 209709: test perf and remove if outweighs space */
    if (*stub == ADDR_PREFIX_OPCODE)
        stub_size = STUB_COARSE_INDIRECT_SIZE(FRAG_32_BIT);
    else /* default */
# endif
        stub_size = STUB_COARSE_INDIRECT_SIZE(0);
    jmp = stub + stub_size - JMP_LONG_LENGTH;
    ASSERT(*jmp == JMP_OPCODE);
    prefix_tgt = (cache_pc) PC_RELATIVE_TARGET(jmp+1);
    ASSERT(*prefix_tgt == JMP_OPCODE);
    tgt = (cache_pc) PC_RELATIVE_TARGET(prefix_tgt+1);
    return tgt;
#elif defined(ARM)
    /* FIXMED i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
    return NULL;
#endif /* X86/ARM */
}

uint
coarse_indirect_stub_size(coarse_info_t *info)
{
    /* Keep in synch w/ exit_stub_size().  We export this separately since
     * it's difficult to get the target to pass to exit_stub_size().
     */
    return STUB_COARSE_INDIRECT_SIZE(COARSE_32_FLAG(info));
}

/* Passing in stub's info avoids a vmvector lookup */
bool
entrance_stub_linked(cache_pc stub, coarse_info_t *info /*OPTIONAL*/)
{
    /* entrance stubs are of two types:
     * - targeting trace heads: always point to trace_head_return_coarse,
     *   whether target exists or not, so are always unlinked;
     * - targeting non-trace-heads: if linked, point to fragment; if unlinked,
     *   point to fcache_return_coarse
     */
    cache_pc tgt = entrance_stub_jmp_target(stub);
    /* FIXME: do vmvector just once instead of for each call */
    return (tgt != trace_head_return_coarse_prefix(stub, info) &&
            tgt != fcache_return_coarse_prefix(stub, info));
}

/* Returns whether it had to change page protections */
static bool
patch_coarse_branch(cache_pc stub, cache_pc tgt, bool hot_patch,
                    coarse_info_t *info /*OPTIONAL*/)
{
    bool stubs_readonly = false;
    bool stubs_restore = false;
    if (DYNAMO_OPTION(persist_protect_stubs)) {
        if (info == NULL)
            info = get_stub_coarse_info(stub);
        ASSERT(info != NULL);
        if (info->stubs_readonly) {
            stubs_readonly = true;
            stubs_restore = true;
            /* if we don't preserve mapped-in COW state the protection change
             * will fail (case 10570)
             */
            make_copy_on_writable((byte *)PAGE_START(entrance_stub_jmp(stub)),
                                  /* stub jmp can't cross page boundary (can't
                                   * cross cache line in fact) */
                                  PAGE_SIZE);
            if (DYNAMO_OPTION(persist_protect_stubs_limit) > 0) {
                info->stubs_write_count++;
                if (info->stubs_write_count >
                    DYNAMO_OPTION(persist_protect_stubs_limit)) {
                    SYSLOG_INTERNAL_WARNING_ONCE("pcache stubs over write limit");
                    STATS_INC(pcache_unprot_over_limit);
                    stubs_restore = false;
                    info->stubs_readonly = false;
                }
            }
        }
    }
    patch_branch(entrance_stub_jmp(stub), tgt, HOT_PATCHABLE);
    if (stubs_restore)
        make_unwritable((byte *)PAGE_START(entrance_stub_jmp(stub)), PAGE_SIZE);
    return stubs_readonly;
}

/* Passing in stub's info avoids a vmvector lookup */
void
link_entrance_stub(dcontext_t *dcontext, cache_pc stub, cache_pc tgt,
                   bool hot_patch, coarse_info_t *info /*OPTIONAL*/)
{
    ASSERT(DYNAMO_OPTION(coarse_units));
    ASSERT(self_owns_recursive_lock(&change_linking_lock));
    LOG(THREAD, LOG_LINKS, 5, "link_entrance_stub "PFX"\n", stub);
    if (patch_coarse_branch(stub, tgt, hot_patch, info))
        STATS_INC(pcache_unprot_link);
    /* We check this afterward since this link may be what makes it consistent
     * FIXME: pass in arg to not check target?  Then call before and after */
    ASSERT(coarse_is_entrance_stub(stub));
}

/* Passing in stub's info avoids a vmvector lookup */
void
unlink_entrance_stub(dcontext_t *dcontext, cache_pc stub, uint flags,
                     coarse_info_t *info /*OPTIONAL*/)
{
    cache_pc tgt;
    ASSERT(DYNAMO_OPTION(coarse_units));
    ASSERT(coarse_is_entrance_stub(stub));
    ASSERT(self_owns_recursive_lock(&change_linking_lock));
    LOG(THREAD, LOG_LINKS, 5,
        "unlink_entrance_stub "PFX"\n", stub);
    if (TESTANY(FRAG_IS_TRACE_HEAD|FRAG_IS_TRACE, flags))
        tgt = trace_head_return_coarse_prefix(stub, info);
    else
        tgt = fcache_return_coarse_prefix(stub, info);
    if (patch_coarse_branch(stub, tgt, HOT_PATCHABLE, info))
        STATS_INC(pcache_unprot_unlink);
}

cache_pc
entrance_stub_from_cti(cache_pc cti)
{
    cache_pc disp = exit_cti_disp_pc(cti);
    cache_pc tgt = (cache_pc) PC_RELATIVE_TARGET(disp);
    return tgt;
}

/*******************************************************************************/

/* Patch list support routines */
void
init_patch_list(patch_list_t *patch, patch_list_type_t type)
{
    patch->num_relocations = 0;
    /* Cast to int to avoid a tautological comparison warning from clang. */
    ASSERT_TRUNCATE(patch->type, ushort, (int)type);
    patch->type = (ushort) type;
}

/* add an instruction to patch list and address of location for future updates */
/* Use the type checked wrappers add_patch_entry or add_patch_marker */
void
add_patch_entry_internal(patch_list_t *patch, instr_t *instr, ushort patch_flags,
                         short instruction_offset,
                         ptr_uint_t value_location_offset)
{
    uint i = patch->num_relocations;

    ASSERT(patch->num_relocations < MAX_PATCH_ENTRIES);
    /* Since in debug build we have the extra slots for stats, it's important
     * to provide a useful release build message
     */
    if (patch->num_relocations >= MAX_PATCH_ENTRIES) {
        SYSLOG_CUSTOM_NOTIFY(SYSLOG_CRITICAL, MSG_EXCEPTION, 4,
                             "Maximum patch entries exceeded",
                             get_application_name(), get_application_pid(),
                             "<maxpatch>", "Maximum patch entries exceeded");
        os_terminate(get_thread_private_dcontext(), TERMINATE_PROCESS);
        ASSERT_NOT_REACHED();
    }

    LOG(THREAD_GET, LOG_EMIT, 4,
        "add_patch_entry[%d] value_location_offset="PFX"\n", i,
        value_location_offset);

    patch->entry[i].where.instr = instr;
    patch->entry[i].patch_flags = patch_flags;
    patch->entry[i].value_location_offset = value_location_offset;
    patch->entry[i].instr_offset = instruction_offset;

    patch->num_relocations++;
}


/* add an instruction to patch list to retrieve its offset later.
   Takes an instruction and an offset within the instruction.
   Result: The offset within an encoded instruction stream will
   be stored in target_offset by encode_with_patch_list
*/
void
add_patch_marker(patch_list_t *patch, instr_t *instr, ushort patch_flags,
                 short instr_offset, ptr_uint_t *target_offset /* OUT */)
{
    add_patch_entry_internal(patch, instr, (ushort) (patch_flags | PATCH_MARKER),
                             instr_offset, (ptr_uint_t) target_offset);
}

/* remove PATCH_MARKER entries since not needed for dynamic updates */
static INLINE_ONCE void
remove_assembled_patch_markers(dcontext_t *dcontext, patch_list_t *patch)
{
    ushort i=0, j=0;

    /* we can remove the PATCH_MARKER entries after encoding,
       and so patch_emitted_code won't even need to check for PATCH_MARKER
    */

    while (j < patch->num_relocations) {
        if (TEST(PATCH_MARKER, patch->entry[j].patch_flags)) {
            LOG(THREAD, LOG_EMIT, 4,
                "remove_assembled_patch_markers: removing marker %d\n", j);
        } else {
            patch->entry[i] = patch->entry[j];
            i++;
        }

        j++;
    }

    LOG(THREAD, LOG_EMIT, 3, "remove_assembled_patch_markers: relocations %d, left only %d\n",
        patch->num_relocations, i);
    patch->num_relocations = i;
}


/* Indirect all instructions instead of later patching */
static void
relocate_patch_list(dcontext_t *dcontext, patch_list_t *patch,
                    instrlist_t *ilist)
{
    instr_t *inst;
    uint cur = 0;
    LOG(THREAD, LOG_EMIT, 3, "relocate_patch_list ["PFX"]\n", patch);

    /* go through the instructions and "relocate" by indirectly using XDI */
    for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) {
        if (cur < patch->num_relocations &&
            inst == patch->entry[cur].where.instr) {
            ASSERT(!TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags));

            if (!TEST(PATCH_MARKER, patch->entry[cur].patch_flags)) {
                opnd_t opnd;
                ASSERT(instr_num_srcs(inst) > 0);
                opnd = instr_get_src(inst, 0);

                DOLOG(4, LOG_EMIT, {
                    LOG(THREAD, LOG_EMIT, 2,
                        "encode_with_patch_list: patch_entry_t[%d] before update \n");
                    instr_disassemble(dcontext, inst, THREAD);
                    LOG(THREAD, LOG_EMIT, 2, "\n");
                });
                /* we assume that per_thread_t will be in XDI,
                   and the displacement is in value_location_offset */
                IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int
                              (patch->entry[cur].value_location_offset)));
                if (opnd_is_near_base_disp(opnd)) {
                    /* indirect through XDI and update displacement */
                    opnd_set_disp(&opnd, (int) patch->entry[cur].value_location_offset);
                    opnd_replace_reg(&opnd, REG_NULL, SCRATCH_REG5/*xdi/r5*/);
                } else if (opnd_is_immed_int(opnd)) {
                    /* indirect through XDI and set displacement */
                    /* converting AND $0x00003fff, %xcx -> %xcx
                       into       AND  mask(%xdi), %xcx -> %xcx
                    */
                    opnd = opnd_create_base_disp
                        (SCRATCH_REG5/*xdi/r5*/, REG_NULL, 0,
                         (int) patch->entry[cur].value_location_offset, OPSZ_4);
                }

                instr_set_src(inst, 0, opnd);
                DOLOG(3, LOG_EMIT, {
                    LOG(THREAD, LOG_EMIT, 2,
                        "encode_with_patch_list: patch_entry_t[%d] after update \n");
                    instr_disassemble(dcontext, inst, THREAD);
                    LOG(THREAD, LOG_EMIT, 2, "\n");
                });
            }
            cur++;
        }
    }
}

/* Updates patch list with offsets in assembled instruction list */
/* Cf: instrlist_encode which does not support a patch list */
/* Returns length of emitted code */
int
encode_with_patch_list(dcontext_t *dcontext, patch_list_t *patch,
                       instrlist_t *ilist, cache_pc start_pc)
{
    instr_t *inst;
    uint len;
    uint cur;
    cache_pc pc = start_pc;

    ASSERT(patch->num_relocations < MAX_PATCH_ENTRIES);

    if (patch->type == PATCH_TYPE_INDIRECT_XDI) {
        relocate_patch_list(dcontext, patch, ilist);
    }

    /* now encode the instructions */
    /* must set note fields first with offset */
    len = 0;
    for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) {
        instr_set_note(inst, (void *)(ptr_uint_t)len);
        len += instr_length(dcontext, inst);
    }

    cur = 0;
    /* after instruction list is assembled we collect the offsets */
    for (inst = instrlist_first(ilist); inst; inst = instr_get_next(inst)) {
        short offset_in_instr = patch->entry[cur].instr_offset;
        byte *nxt_pc = instr_encode(dcontext, inst, pc);
        ASSERT(nxt_pc != NULL);
        len = (int) (nxt_pc - pc);
        pc = nxt_pc;

        if (cur < patch->num_relocations &&
            inst == patch->entry[cur].where.instr) {
            ASSERT(!TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags));

            /* support positive offsets from beginning and negative - from end of instruction */
            if (offset_in_instr < 0) {
                /* grab offset offset_in_instr bytes from the end of instruction */
                /* most commonly -4 for a 32bit immediate  */
                patch->entry[cur].where.offset =
                    ((pc + offset_in_instr) - start_pc);
            } else {
                /* grab offset after skipping offset_in_instr from beginning of instruction */
                patch->entry[cur].where.offset =
                    ((pc - len + offset_in_instr) - start_pc);
            }
            patch->entry[cur].patch_flags |= PATCH_OFFSET_VALID;

            LOG(THREAD, LOG_EMIT, 4,
                "encode_with_patch_list: patch_entry_t[%d] offset="PFX"\n",
                cur, patch->entry[cur].where.offset);

            if (TEST(PATCH_MARKER, patch->entry[cur].patch_flags)) {
                /* treat value_location_offset as an output argument
                   and store there the computed offset,
                */
                ptr_uint_t *output_value = (ptr_uint_t *)
                    patch->entry[cur].value_location_offset;
                ptr_uint_t output_offset = patch->entry[cur].where.offset;
                if (TEST(PATCH_ASSEMBLE_ABSOLUTE, patch->entry[cur].patch_flags)) {
                    ASSERT(!TEST(PATCH_UINT_SIZED, patch->entry[cur].patch_flags));
                    output_offset += (ptr_uint_t)start_pc;
                }
                if (TEST(PATCH_UINT_SIZED, patch->entry[cur].patch_flags)) {
                    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(output_offset)));
                    *((uint *)output_value) = (uint) output_offset;
                } else
                    *output_value = output_offset;
            }

            LOG(THREAD, LOG_EMIT, 4,
                "encode_with_patch_list [%d] extras patch_flags=0x%x value_offset="
                PFX"\n", cur, patch->entry[cur].patch_flags,
                patch->entry[cur].value_location_offset);
            cur++;
        }
    }

    /* assuming patchlist is in the same order as ilist, we should have seen all */
    LOG(THREAD, LOG_EMIT, 4, "cur %d, num %d", cur, patch->num_relocations);
    ASSERT(cur == patch->num_relocations);

    remove_assembled_patch_markers(dcontext, patch);
    ASSERT(CHECK_TRUNCATE_TYPE_int(pc - start_pc));
    return (int)(pc - start_pc);
}

#ifdef DEBUG
void
print_patch_list(patch_list_t *patch)
{
    uint i;
    LOG(THREAD_GET, LOG_EMIT, 4, "patch="PFX" num_relocations=%d\n",
        patch, patch->num_relocations);

    for(i=0; i<patch->num_relocations; i++) {
        ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[i].patch_flags));
        LOG(THREAD_GET, LOG_EMIT, 4,
            "patch_list [%d] offset="PFX" patch_flags=%d value_offset="PFX"\n", i,
            patch->entry[i].where.offset,
            patch->entry[i].patch_flags,
            patch->entry[i].value_location_offset);
    }
}

# ifdef INTERNAL
/* disassembles code adding patch list labels */
static void
disassemble_with_annotations(dcontext_t *dcontext, patch_list_t *patch,
                             byte *start_pc, byte *end_pc)
{
    byte *pc = start_pc;
    uint cur = 0;

    do {
        if (cur < patch->num_relocations &&
            pc >= start_pc + patch->entry[cur].where.offset) {
            ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[cur].patch_flags));
            /* this is slightly off - we'll mark next instruction,
               but is good enough for this purpose */
            LOG(THREAD, LOG_EMIT, 2, "%d:", cur);
            cur++;
        } else {
            LOG(THREAD, LOG_EMIT, 2, "  ");
        }

        pc = disassemble_with_bytes(dcontext, pc, THREAD);
    } while (pc < end_pc);
    LOG(THREAD, LOG_EMIT, 2, "\n");
}
# endif
#endif

/* updates emitted code according to patch list */
static void
patch_emitted_code(dcontext_t *dcontext, patch_list_t *patch, byte *start_pc)
{
    uint i;
    /* FIXME: can get this as a patch list entry through indirection */
    per_thread_t *pt = (per_thread_t *) dcontext->fragment_field;
    ASSERT(dcontext != GLOBAL_DCONTEXT && dcontext != NULL);

    LOG(THREAD, LOG_EMIT, 2, "patch_emitted_code start_pc="PFX" pt="PFX"\n",
        start_pc);
    if (patch->type != PATCH_TYPE_ABSOLUTE) {
        LOG(THREAD, LOG_EMIT, 2,
            "patch_emitted_code type=%d indirected, nothing to patch\n", patch->type);
        /* FIXME: propagate the check earlier to save the extraneous calls
           to update_indirect_exit_stub and update_indirect_branch_lookup
        */
        return;
    }
    DOLOG(4, LOG_EMIT, {
        print_patch_list(patch);
    });
    for(i=0; i<patch->num_relocations; i++) {
        byte *pc = start_pc + patch->entry[i].where.offset;
        /* value address, (think for example of pt->trace.hash_mask) */
        ptr_uint_t value;
        char *vaddr = NULL;
        if (TEST(PATCH_PER_THREAD, patch->entry[i].patch_flags)) {
            vaddr = (char *)pt + patch->entry[i].value_location_offset;
        } else if (TEST(PATCH_UNPROT_STAT, patch->entry[i].patch_flags)) {
            /* separate the two parts of the stat */
            uint unprot_offs = (uint) (patch->entry[i].value_location_offset) >> 16;
            uint field_offs = (uint) (patch->entry[i].value_location_offset) & 0xffff;
            IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint
                          (patch->entry[i].value_location_offset)));
            vaddr = (*((char **)((char *)pt + unprot_offs))) + field_offs;
            LOG(THREAD, LOG_EMIT, 4,
                "patch_emitted_code [%d] value "PFX" => 0x%x 0x%x => "PFX"\n",
                i, patch->entry[i].value_location_offset, unprot_offs, field_offs, vaddr);
        }
        else
            ASSERT_NOT_REACHED();
        ASSERT(TEST(PATCH_OFFSET_VALID, patch->entry[i].patch_flags));
        ASSERT(!TEST(PATCH_MARKER, patch->entry[i].patch_flags));

        if (!TEST(PATCH_TAKE_ADDRESS, patch->entry[i].patch_flags)) {
            /* use value pointed by computed address */
            if (TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags))
                value = (ptr_uint_t) *((uint *)vaddr);
            else
                value = *(ptr_uint_t*)vaddr;
        } else {
            ASSERT(!TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags));
            value = (ptr_uint_t)vaddr;   /* use computed address */
        }

        LOG(THREAD, LOG_EMIT, 4,
            "patch_emitted_code [%d] offset="PFX" patch_flags=%d value_offset="PFX
            " vaddr="PFX" value="PFX"\n", i,
            patch->entry[i].where.offset, patch->entry[i].patch_flags,
            patch->entry[i].value_location_offset, vaddr, value);
        if (TEST(PATCH_UINT_SIZED, patch->entry[i].patch_flags)) {
            IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(value)));
            *((uint*)pc) = (uint) value;
        } else
            *((ptr_uint_t *)pc) = value;
        LOG(THREAD, LOG_EMIT, 4,
            "patch_emitted_code: updated pc *"PFX" = "PFX"\n", pc, value);
    }

    STATS_INC(emit_patched_fragments);
    DOSTATS({
        /* PR 217008: avoid gcc warning from truncation assert in XSTATS_ADD_DC */
        int tmp_num = patch->num_relocations;
        STATS_ADD(emit_patched_relocations, tmp_num);
    });
    LOG(THREAD, LOG_EMIT, 4, "patch_emitted_code done\n");
}


/* Updates an indirect branch exit stub with the latest hashtable mask
 * and hashtable address
 * See also update_indirect_branch_lookup
 */
void
update_indirect_exit_stub(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    generated_code_t *code = get_emitted_routines_code
        (dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)));
# ifdef CUSTOM_EXIT_STUBS
    byte *start_pc = (byte *) EXIT_FIXED_STUB_PC(dcontext, f, l);
# else
    byte *start_pc = (byte *) EXIT_STUB_PC(dcontext, f, l);
# endif
    ibl_branch_type_t branch_type;

    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_INDIRECT(l->flags));
    ASSERT(EXIT_HAS_STUB(l->flags, f->flags));
    /* Shared use indirection so no patching needed -- caller should check */
    ASSERT(!TEST(FRAG_SHARED, f->flags));
#ifdef WINDOWS
    /* Do not touch shared_syscall */
    if (EXIT_TARGET_TAG(dcontext, f, l) ==
        shared_syscall_routine_ex(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags))))
        return;
#endif
    branch_type = extract_branchtype(l->flags);

    LOG(THREAD, LOG_EMIT, 4, "update_indirect_exit_stub: f->tag="PFX"\n",
        f->tag);

    if (DYNAMO_OPTION(disable_traces) && !code->bb_ibl[branch_type].ibl_head_is_inlined) {
        return;
    }

    if (TEST(FRAG_IS_TRACE, f->flags)) {
        ASSERT(code->trace_ibl[branch_type].ibl_head_is_inlined);
        patch_emitted_code(dcontext, &code->trace_ibl[branch_type].ibl_stub_patch, start_pc);
    } else {
        ASSERT(code->bb_ibl[branch_type].ibl_head_is_inlined);
        patch_emitted_code(dcontext, &code->bb_ibl[branch_type].ibl_stub_patch, start_pc);
    }
}

/*###########################################################################
 *
 * fragment_t Prefixes
 *
 * Two types: indirect branch target, which restores eflags and xcx, and
 * normal prefix, which just restores xcx
 */

/* Indirect Branch Target Prefix
 * We have 3 different prefixes: one if we don't need to restore eflags, one
 * if we need to restore just using sahf, and one if we also need to restore
 * the overflow flag OF.
 *
 * FIXME: currently we cache-align the prefix, not the normal
 * entry point...if prefix gets much longer, might want to add
 * nops to get normal entry cache-aligned?
 */

/* for now all ibl targets must use same scratch locations: tls or not, no mixture */

#define RESTORE_XAX_PREFIX(flags) \
    ((FRAG_IS_X86_TO_X64(flags) && \
      IF_X64_ELSE(DYNAMO_OPTION(x86_to_x64_ibl_opt), false)) ? \
     SIZE64_MOV_R8_TO_XAX : \
     (IBL_EFLAGS_IN_TLS() ? SIZE_MOV_XAX_TO_TLS(flags, false) : SIZE32_MOV_XAX_TO_ABS))
#define PREFIX_BASE(flags) \
    (RESTORE_XAX_PREFIX(flags) + FRAGMENT_BASE_PREFIX_SIZE(flags))


int
fragment_prefix_size(uint flags)
{
    if (use_ibt_prefix(flags)) {
        bool use_eflags_restore = TEST(FRAG_IS_TRACE, flags) ?
            !DYNAMO_OPTION(trace_single_restore_prefix) :
            !DYNAMO_OPTION(bb_single_restore_prefix);
        /* The common case is !INTERNAL_OPTION(unsafe_ignore_eflags*) so
         * PREFIX_BASE(flags) is defined accordingly, and we subtract from it to
         * get the correct value when the option is on.
         */
        if (INTERNAL_OPTION(unsafe_ignore_eflags_prefix)) {
            if (INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
                ASSERT(PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags) >= 0);
                return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags);
            } else {
                /* still need to restore xax, just don't restore eflags */
                return PREFIX_BASE(flags);
            }
        }
        if (!use_eflags_restore)
            return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags);
        if (TEST(FRAG_WRITES_EFLAGS_6, flags)) /* no flag restoration needed */
            return PREFIX_BASE(flags);
        else if (TEST(FRAG_WRITES_EFLAGS_OF, flags)) /* no OF restoration needed */
            return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS);
        else /* must restore all 6 flags */
            if (INTERNAL_OPTION(unsafe_ignore_overflow)) {
                /* do not restore OF */
                return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS);
            } else {
                return (PREFIX_BASE(flags) + PREFIX_SIZE_RESTORE_OF +
                        PREFIX_SIZE_FIVE_EFLAGS);
            }
    } else {
#ifdef CLIENT_INTERFACE
        if (dynamo_options.bb_prefixes)
            return FRAGMENT_BASE_PREFIX_SIZE(flags);
        else
#endif
            return 0;
    }
}

#ifdef PROFILE_RDTSC
/***************************************************************************
 ***************************************************************************
 ** PROFILING USING RDTSC
 **
 **/
/*
We want the profile code to not count towards fragment times.
So we stop time as quickly as possible, in assembly here instead of
in the profile_fragment_enter function, and start time again as late
as possible:
    mov %eax, eax_offset(dcontext) # save eax
    mov %edx, edx_offset(dcontext) # save edx
    rdtsc                          # stop time
    switch to dynamo stack
    pushfl                         # save eflags (call will clobber)
    mov %ecx, ecx_offset(dcontext) # save ecx
    pushl %edx                     # pass time as arg
    pushl %eax
    pushil &fragment_address       # pass &frag as arg
    call profile_fragment_enter    #
    addl $0xc, %esp                # clean up args
    popl %ecx                      # restore ecx
    popfl                          # restore eflags
    restore app stack
    rdtsc                          # start time
    movl %eax, start_time_OFFS(dcontext)   # store time value
    movl %edx, 4+start_time_OFFS(dcontext) # store time value
    mov eax_offset(dcontext), %eax   # restore eax
    mov edx_offset(dcontext), %edx   # restore edx
    mov ecx_offset(dcontext), %ecx   # restore ecx
*/

static uint profile_call_length = 0;
static int profile_call_fragment_offset = 0;
static int profile_call_call_offset = 0;
static byte profile_call_buf[128];
static dcontext_t *buffer_dcontext;
static void build_profile_call_buffer(void);

uint
profile_call_size()
{
    if (profile_call_length == 0)
        build_profile_call_buffer();
    return profile_call_length;
}

/* if insert_profile_call emits its code into the trace buffer, this
 * routine must be called once the fragment is created and the code is
 * in the fcache
 */
void
finalize_profile_call(dcontext_t *dcontext, fragment_t *f)
{
    byte *start_pc = (byte *) FCACHE_ENTRY_PC(f);
    byte *pc;
    byte *prev_pc;
    instr_t instr;
    instr_init(dcontext, &instr);

    /* fill in address of owning fragment now that that fragment exists */
    pc = start_pc + profile_call_fragment_offset;
    /* PR 248210: unsupported feature on x64 */
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));
    *((int *)pc) = (uint)f;

    /* fill in call's proper pc-relative offset now that code is
     * in its final location in fcache
     */
    pc = start_pc + profile_call_call_offset;
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));
    *((int *)pc) = (int)&profile_fragment_enter - (int)pc - 4;

    /* must fix up all dcontext references to point to the right dcontext */
    pc = start_pc;
    do {
        prev_pc = pc;
        instr_reset(dcontext, &instr);
        pc = decode(dcontext, pc, &instr);
        ASSERT(instr_valid(&instr)); /* our own code! */
        /* look for loads and stores that reference buffer_dcontext */
        if (instr_get_opcode(&instr) == OP_mov_ld &&
            opnd_is_near_base_disp(instr_get_src(&instr, 0)) &&
            opnd_get_base(instr_get_src(&instr, 0)) == REG_NULL &&
            opnd_get_index(instr_get_src(&instr, 0)) == REG_NULL) {
            /* if not really dcontext value, update_ will return old value */
            instr_set_src(&instr, 0,
                          update_dcontext_address(instr_get_src(&instr, 0),
                                                  buffer_dcontext, dcontext));
        }
        else if (instr_get_opcode(&instr) == OP_mov_st &&
                 opnd_is_near_base_disp(instr_get_dst(&instr, 0)) &&
                 opnd_get_base(instr_get_dst(&instr, 0)) == REG_NULL &&
                 opnd_get_index(instr_get_dst(&instr, 0)) == REG_NULL) {
            /* if not really dcontext value, update_ will return old value */
            instr_set_dst(&instr, 0,
                          update_dcontext_address(instr_get_dst(&instr, 0),
                                                  buffer_dcontext, dcontext));
        }
        if (!instr_raw_bits_valid(&instr)) {
            DEBUG_DECLARE(byte *nxt_pc;)
            DEBUG_DECLARE(nxt_pc = ) instr_encode(dcontext, &instr, prev_pc);
            ASSERT(nxt_pc != NULL);
        }
    } while (pc < start_pc + profile_call_length);
    instr_free(dcontext, &instr);
}


void
insert_profile_call(cache_pc start_pc)
{
    if (profile_call_length == 0)
        build_profile_call_buffer();
    memcpy((void *)start_pc, profile_call_buf, profile_call_length);
    /* if thread-private, we change to proper dcontext when finalizing */
}


/* This routine builds the profile call code using the instr_t
 * abstraction, then emits it into a buffer to be saved.
 * The code can then be directly copied whenever needed.
 * Assumption: this thread's dcontext must have been created
 * before calling this function.
 */
static void
build_profile_call_buffer()
{
    byte *pc, *nxt_pc;
    instrlist_t ilist;
    instr_t *inst;
    int start_time_offs;
    dcontext_t *dcontext = get_thread_private_dcontext();
    ASSERT(dcontext != NULL);
    /* remember dcontext for easy replacement when finalizing: */
    buffer_dcontext = dcontext;

    /* we require a dcontext to find this offset because it may
     * or may not be pushed to a quadword boundary, making it
     * hard to hardcode it
     */
    start_time_offs = (int)(&(dcontext->start_time)) - (int)dcontext;

    /* initialize the ilist */
    instrlist_init(&ilist);

    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS));
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EDX, SCRATCH_REG3_OFFS));

    /* get time = rdtsc */
    APP(&ilist, INSTR_CREATE_rdtsc(dcontext));

    /* swap to dstack */
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ESP, XSP_OFFSET));
    APP(&ilist, instr_create_restore_dynamo_stack(dcontext));

    /* finish saving caller-saved registers
     * The profile_fragment_enter function will save the callee-saved
     * regs (ebx, ebp, esi, edi) and will restore ebp and esp, but we need
     * to explicitly save eax, ecx, and edx
     */
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS));

    /* save eflags (call will clobber) */
    APP(&ilist, INSTR_CREATE_RAW_pushf(dcontext));

#ifdef WINDOWS
    /* must preserve the LastErrorCode (if the profile procedure
     * calls a Win32 API routine it could overwrite the app's error code)
     * currently this is done in the profile routine itself --
     * if you want to move it here, look at the code in profile.c
     */
#endif

    /* push time as 2nd argument for call */
    APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EDX)));
    APP(&ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_EAX)));

    /* push fragment address as 1st argument for call
     * fragment isn't built yet, we fill it in in finalize_profile_call
     */
    APP(&ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0)));

    /* call near rel: 4-byte pc-relative offset from start of next instr
     * we don't have that offset now so we fill it in later (in
     * finalize_profile_call)
     */
    APP(&ilist, INSTR_CREATE_call(dcontext, opnd_create_pc(NULL)));

    /* pop arguments: addl $0xc, %esp */
    APP(&ilist,
        INSTR_CREATE_add(dcontext, opnd_create_reg(REG_ESP), OPND_CREATE_INT8(0xc)));

    /* restore eflags */
    APP(&ilist, INSTR_CREATE_RAW_popf(dcontext));

    /* restore caller-saved registers */
    APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS));

    /* restore app stack */
    APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ESP, XSP_OFFSET));

    /* get start time = rdtsc */
    APP(&ilist, INSTR_CREATE_rdtsc(dcontext));

    /* copy start time into dcontext */
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, start_time_offs));
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EDX, start_time_offs+4));

    /* finish restoring caller-saved registers */
    APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_EDX, SCRATCH_REG3_OFFS));
    APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS));

    /* now encode the instructions */
    pc = profile_call_buf;
    for (inst = instrlist_first(&ilist); inst; inst = instr_get_next(inst)) {
        if (instr_is_call_direct(inst)) {
            /* push_immed was just before us, so fragment address
             * starts 4 bytes before us:
             */
            profile_call_fragment_offset = (int) (pc - 4 - profile_call_buf);
            /* call opcode is 1 byte, offset is next: */
            profile_call_call_offset = (int) (pc + 1 - profile_call_buf);
        }
        /* we have no jumps with instr_t targets so we don't need to set note
         * field in order to use instr_encode
         */
        nxt_pc = instr_encode(dcontext, inst, (void*)pc);
        ASSERT(nxt_pc != NULL);
        profile_call_length += nxt_pc - pc;
        pc = nxt_pc;
        ASSERT(profile_call_length < 128);
    }

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
}

#endif /* PROFILE_RDTSC */

#ifdef WINDOWS
# ifdef CLIENT_INTERFACE

/* Leaving in place old notes on LastError preservation: */
    /* inlined versions of save/restore last error by reading of TIB */
    /* If our inlined version fails on a later version of windows
       should verify [GS]etLastError matches the disassembly below.
    */
    /* Win2000: kernel32!SetLastError: */
    /*   77E87671: 55                 push        ebp */
    /*   77E87672: 8B EC              mov         ebp,esp */
    /*   77E87674: 64 A1 18 00 00 00  mov         eax,fs:[00000018] */
    /*   77E8767A: 8B 4D 08           mov         ecx,dword ptr [ebp+8] */
    /*   77E8767D: 89 48 34           mov         dword ptr [eax+34h],ecx */
    /*   77E87680: 5D                 pop         ebp */
    /*   77E87681: C2 04 00           ret         4 */

    /* Win2003: ntdll!RtlSetLastWin32Error: optimized to */
    /*   77F45BB4: 64 A1 18 00 00 00  mov         eax,fs:[00000018] */
    /*   77F45BBA: 8B 4C 24 04        mov         ecx,dword ptr [esp+4] */
    /*   77F45BBE: 89 48 34           mov         dword ptr [eax+34h],ecx */
    /*   77F45BC1: C2 04 00           ret         4 */

    /* See InsideWin2k, p. 329 SelfAddr fs:[18h] simply has the linear address of the TIB
       while we're interested only in LastError which is at fs:[34h] */
    /* Therefore all we need is a single instruction! */
    /* 64 a1 34 00 00 00  mov         dword ptr fs:[34h],errno_register */
    /* Overall savings: 7 instructions, 5 data words */

    /*kernel32!GetLastError:*/
    /*   77E87684: 64 A1 18 00 00 00  mov         eax,fs:[00000018] */
    /*   77E8768A: 8B 40 34           mov         eax,dword ptr [eax+34h] */
    /*   77E8768D: C3                 ret */

    /* All we need is a single instruction: */
    /*  77F45BBE: 89 48 34           mov         reg_result, dword ptr fs:[34h] */

/* i#249: isolate app's PEB+TEB by keeping our own copy and swapping on cxt switch
 * For clean calls we share this in clean_call_{save,restore} (i#171, i#1349).
 */
void
preinsert_swap_peb(dcontext_t *dcontext, instrlist_t *ilist, instr_t *next,
                   bool absolute, reg_id_t reg_dr, reg_id_t reg_scratch, bool to_priv)
{
    /* We assume PEB is globally constant and we don't need per-thread pointers
     * and can use use absolute pointers known at init time
     */
    PEB *tgt_peb = to_priv ? get_private_peb() : get_own_peb();
    reg_id_t scratch32 = IF_X64_ELSE(reg_64_to_32(reg_scratch), reg_scratch);
    ASSERT(INTERNAL_OPTION(private_peb) && should_swap_peb_pointer());
    ASSERT(reg_dr != REG_NULL && reg_scratch != REG_NULL);
    /* can't store 64-bit immed, so we use scratch reg, for 32-bit too since
     * long 32-bit-immed-store instr to fs:offs is slow to decode
     */
    PRE(ilist, next, INSTR_CREATE_mov_imm
        (dcontext, opnd_create_reg(reg_scratch), OPND_CREATE_INTPTR((ptr_int_t)tgt_peb)));
    PRE(ilist, next, XINST_CREATE_store
        (dcontext, opnd_create_far_base_disp
         (SEG_TLS, REG_NULL, REG_NULL, 0, PEB_TIB_OFFSET, OPSZ_PTR),
         opnd_create_reg(reg_scratch)));

    /* Preserve app's TEB->LastErrorValue.  We used to do this separately b/c
     * DR at one point long ago made some win32 API calls: now we only have to
     * do this when loading private libraries.  We assume no private library
     * code needs to preserve LastErrorCode across app execution.
     */
    if (to_priv) {
        /* yes errno is 32 bits even on x64 */
        PRE(ilist, next, XINST_CREATE_load
            (dcontext, opnd_create_reg(scratch32), opnd_create_far_base_disp
             (SEG_TLS, REG_NULL, REG_NULL, 0, ERRNO_TIB_OFFSET, OPSZ_4)));
        PRE(ilist, next, SAVE_TO_DC_VIA_REG
            (absolute, dcontext, reg_dr, scratch32, APP_ERRNO_OFFSET));
    } else {
        PRE(ilist, next, RESTORE_FROM_DC_VIA_REG
            (absolute, dcontext, reg_dr, scratch32, APP_ERRNO_OFFSET));
        PRE(ilist, next, XINST_CREATE_store
            (dcontext, opnd_create_far_base_disp
             (SEG_TLS, REG_NULL, REG_NULL, 0, ERRNO_TIB_OFFSET, OPSZ_4),
             opnd_create_reg(scratch32)));
    }

#ifdef X64
    /* We have to swap TEB->StackLimit (i#1102).  For now I'm only doing this
     * on X64, though it seems possible for 32-bit stacks to be up high too?
     * We have never seen that.
     */
    if (to_priv) {
        PRE(ilist, next, XINST_CREATE_load
            (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp
             (SEG_TLS, REG_NULL, REG_NULL, 0, BASE_STACK_TIB_OFFSET, OPSZ_PTR)));
        PRE(ilist, next, SAVE_TO_DC_VIA_REG
            (absolute, dcontext, reg_dr, reg_scratch, APP_STACK_LIMIT_OFFSET));
        PRE(ilist, next, RESTORE_FROM_DC_VIA_REG
            (absolute, dcontext, reg_dr, reg_scratch, DSTACK_OFFSET));
        PRE(ilist, next, INSTR_CREATE_lea
            (dcontext, opnd_create_reg(reg_scratch),
             opnd_create_base_disp(reg_scratch, REG_NULL, 0,
                                   -(int)DYNAMORIO_STACK_SIZE, OPSZ_lea)));
        PRE(ilist, next, XINST_CREATE_store
            (dcontext, opnd_create_far_base_disp
             (SEG_TLS, REG_NULL, REG_NULL, 0, BASE_STACK_TIB_OFFSET, OPSZ_PTR),
             opnd_create_reg(reg_scratch)));
    } else {
        PRE(ilist, next, RESTORE_FROM_DC_VIA_REG
            (absolute, dcontext, reg_dr, reg_scratch, APP_STACK_LIMIT_OFFSET));
        PRE(ilist, next, XINST_CREATE_store
            (dcontext, opnd_create_far_base_disp
             (SEG_TLS, REG_NULL, REG_NULL, 0, BASE_STACK_TIB_OFFSET, OPSZ_PTR),
             opnd_create_reg(reg_scratch)));
    }
#endif

    /* We also swap TEB->NlsCache.  Unlike TEB->ProcessEnvironmentBlock, which is
     * constant, and TEB->LastErrorCode, which is not peristent, we have to maintain
     * both values and swap between them which is expensive.
     */
    PRE(ilist, next, XINST_CREATE_load
        (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp
         (SEG_TLS, REG_NULL, REG_NULL, 0, NLS_CACHE_TIB_OFFSET, OPSZ_PTR)));
    PRE(ilist, next, SAVE_TO_DC_VIA_REG
        (absolute, dcontext, reg_dr, reg_scratch,
         to_priv ? APP_NLS_CACHE_OFFSET : PRIV_NLS_CACHE_OFFSET));
    PRE(ilist, next, RESTORE_FROM_DC_VIA_REG
        (absolute, dcontext, reg_dr, reg_scratch,
         to_priv ? PRIV_NLS_CACHE_OFFSET : APP_NLS_CACHE_OFFSET));
    PRE(ilist, next, XINST_CREATE_store
        (dcontext, opnd_create_far_base_disp
         (SEG_TLS, REG_NULL, REG_NULL, 0, NLS_CACHE_TIB_OFFSET, OPSZ_PTR),
         opnd_create_reg(reg_scratch)));
    /* We also swap TEB->FlsData.  Unlike TEB->ProcessEnvironmentBlock, which is
     * constant, and TEB->LastErrorCode, which is not peristent, we have to maintain
     * both values and swap between them which is expensive.
     */
    PRE(ilist, next, XINST_CREATE_load
        (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp
         (SEG_TLS, REG_NULL, REG_NULL, 0, FLS_DATA_TIB_OFFSET, OPSZ_PTR)));
    PRE(ilist, next, SAVE_TO_DC_VIA_REG
        (absolute, dcontext, reg_dr, reg_scratch,
         to_priv ? APP_FLS_OFFSET : PRIV_FLS_OFFSET));
    PRE(ilist, next, RESTORE_FROM_DC_VIA_REG
        (absolute, dcontext, reg_dr, reg_scratch,
         to_priv ? PRIV_FLS_OFFSET : APP_FLS_OFFSET));
    PRE(ilist, next, XINST_CREATE_store
        (dcontext, opnd_create_far_base_disp
         (SEG_TLS, REG_NULL, REG_NULL, 0, FLS_DATA_TIB_OFFSET, OPSZ_PTR),
         opnd_create_reg(reg_scratch)));
    /* We swap TEB->ReservedForNtRpc as well.  Hopefully there won't be many
     * more we'll have to swap.
     */
    PRE(ilist, next, XINST_CREATE_load
        (dcontext, opnd_create_reg(reg_scratch), opnd_create_far_base_disp
         (SEG_TLS, REG_NULL, REG_NULL, 0, NT_RPC_TIB_OFFSET, OPSZ_PTR)));
    PRE(ilist, next, SAVE_TO_DC_VIA_REG
        (absolute, dcontext, reg_dr, reg_scratch,
         to_priv ? APP_RPC_OFFSET : PRIV_RPC_OFFSET));
    PRE(ilist, next, RESTORE_FROM_DC_VIA_REG
        (absolute, dcontext, reg_dr, reg_scratch,
         to_priv ? PRIV_RPC_OFFSET : APP_RPC_OFFSET));
    PRE(ilist, next, XINST_CREATE_store
        (dcontext, opnd_create_far_base_disp
         (SEG_TLS, REG_NULL, REG_NULL, 0, NT_RPC_TIB_OFFSET, OPSZ_PTR),
         opnd_create_reg(reg_scratch)));
}
# endif /* CLIENT_INTERFACE */

#endif /* WINDOWS */

/***************************************************************************/
/*             THREAD-PRIVATE/SHARED ROUTINE GENERATION                    */
/***************************************************************************/

/* Export this in instr.h if it becomes useful elsewhere */
#ifdef X86
# ifdef X64
#  ifdef WINDOWS
#   define OPND_ARG1  opnd_create_reg(REG_RCX)
#  else
#   define OPND_ARG1  opnd_create_reg(REG_RDI)
#  endif /* Win/Unix */
# else
#  define OPND_ARG1   OPND_CREATE_MEM32(REG_ESP, 4)
# endif /* 64/32-bit */
#elif defined(ARM)
# define OPND_ARG1    opnd_create_reg(DR_REG_R0)
#endif /* X86/ARM */

/* register for holding dcontext on fcache enter/return */
#define REG_DCTXT SCRATCH_REG5

/* append instructions to setup fcache target
 *   if (!absolute)
 *     # put target somewhere we can be absolute about
 *     RESTORE_FROM_UPCONTEXT next_tag_OFFSET,%xax
 *     if (shared)
 *       mov  %xax,fs:xax_OFFSET
 *     endif
 *   endif
 */
static void
append_setup_fcache_target(dcontext_t *dcontext, instrlist_t *ilist,
                           bool absolute, bool shared)
{
    if (absolute)
        return;

    /* put target into special slot that we can be absolute about */
    APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG0, NEXT_TAG_OFFSET));
    if (shared) {
        APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG0, FCACHE_ENTER_TARGET_SLOT));
    } else {
#ifdef WINDOWS
        /* absolute into main dcontext (not one in REG_DCTXT) */
        APP(ilist, instr_create_save_to_dcontext(dcontext, SCRATCH_REG0,
                                                 NONSWAPPED_SCRATCH_OFFSET));
#else
        /* no special scratch slot! */
        ASSERT_NOT_IMPLEMENTED(false);
#endif /* !WINDOWS */
    }
}

/* append instructions to jump to target in code cache
 *  ifdef X64 and (target is x86 mode)
 *    # we can't indirect through a register since we couldn't restore
 *    # the high bits (PR 283152)
 *    mov gencode-jmp86-value, fs:xbx_OFFSET
 *    far jmp to next instr, stored w/ 32-bit cs selector in fs:xbx_OFFSET
 *  endif
 *
 *  # jump indirect through dcontext->next_tag, set by dispatch()
 *  if (absolute)
 *    JUMP_VIA_DCONTEXT next_tag_OFFSET
 *  else
 *    if (shared)
 *      jmp *fs:xax_OFFSET
 *    else
 *      JUMP_VIA_DCONTEXT nonswapped_scratch_OFFSET
 *    endif
 *  endif
 */
static void
append_jmp_to_fcache_target(dcontext_t *dcontext, instrlist_t *ilist,
                            generated_code_t *code,
                            bool absolute, bool shared, patch_list_t *patch
                            _IF_X64(byte **jmp86_store_addr)
                            _IF_X64(byte **jmp86_target_addr))
{
#ifdef X86_64
    if (GENCODE_IS_X86(code->gencode_mode)) {
        instr_t *label = INSTR_CREATE_label(dcontext);
        instr_t *store;
        /* We must use an indirect jmp (far direct are illegal in x64) and
         * we can't indirect through a register since we couldn't restore the
         * high bits (PR 283152) so we write the 6-byte far address to TLS.
         */
        /* AMD only supports 32-bit address for far jmp */
        store = XINST_CREATE_store(dcontext,
                                   OPND_TLS_FIELD_SZ(TLS_SLOT_REG1, OPSZ_4),
                                   OPND_CREATE_INT32(0/*placeholder*/));
        APP(ilist, store);
        APP(ilist, XINST_CREATE_store(dcontext,
                                      OPND_TLS_FIELD_SZ(TLS_SLOT_REG1+4, OPSZ_2),
                                      OPND_CREATE_INT16((ushort)CS32_SELECTOR)));
        APP(ilist, INSTR_CREATE_jmp_far_ind(dcontext,
                                            OPND_TLS_FIELD_SZ(TLS_SLOT_REG1, OPSZ_6)));
        APP(ilist, label);
        /* We need a patch that involves two instrs, which is not supported,
         * so we get both addresses involved into local vars and do the patch
         * by hand after emitting.
         */
        add_patch_marker(patch, store, PATCH_ASSEMBLE_ABSOLUTE,
                         -4 /* 4 bytes from end */, (ptr_uint_t*)jmp86_store_addr);
        add_patch_marker(patch, label, PATCH_ASSEMBLE_ABSOLUTE,
                         0 /* start of label */, (ptr_uint_t*)jmp86_target_addr);
    }
#endif /* X64 */

    /* Jump indirect through next_tag.  Dispatch set this value with
     * where we want to go next in the fcache_t.
     */
    if (absolute) {
        APP(ilist, instr_create_jump_via_dcontext(dcontext, NEXT_TAG_OFFSET));
    } else {
        if (shared) {
            /* next_tag placed into tls slot earlier in this routine */
            APP(ilist,
                XINST_CREATE_jump_mem(dcontext,
                                      OPND_TLS_FIELD(FCACHE_ENTER_TARGET_SLOT)));

        } else {
#ifdef WINDOWS
            /* FIXME: we could just use tls, right?  no real need for the "shared"
             * parameter?
             */
            /* need one absolute ref using main dcontext (not one in edi):
             * it's the final jmp, using the special slot we set up earlier
             */
            APP(ilist, instr_create_jump_via_dcontext(dcontext,
                                                      NONSWAPPED_SCRATCH_OFFSET));
#else /* !WINDOWS */
            /* no special scratch slot! */
            ASSERT_NOT_IMPLEMENTED(false);
#endif /* !WINDOWS */
        }
    }
}

/* Our context switch to and from the fragment cache are arranged such
 * that there is no persistent state kept on the dstack, allowing us to
 * start with a clean slate on exiting the cache.  This eliminates the
 * need to protect our dstack from inadvertent or malicious writes.
 *
 * We do not bother to save any DynamoRIO state, even the eflags.  We clear
 * them in fcache_return, assuming that a cleared state is always the
 * proper value (df is never set across the cache, etc.)
 *
 * The code is split into several helper functions.
 *
 * # Used by dispatch to begin execution in fcache at dcontext->next_tag
 * fcache_enter(dcontext_t *dcontext)
 *
 *  if (!absolute)
 *      mov    ARG1, SCRATCH_REG5 # dcontext param
 *    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *      RESTORE_FROM_UPCONTEXT PROT_OFFSET, %xsi
 *    endif
 *  endif
 *
 *  # append_setup_fcache_target
 *  if (!absolute)
 *      # put target somewhere we can be absolute about
 *      RESTORE_FROM_UPCONTEXT next_tag_OFFSET, SCRATCH_REG0
 *    if (shared)
 *      mov  SCRATCH_REG0, fs:xax_OFFSET
 *    endif
 *  endif
 *
 *  # append_call_exit_dr_hook
 *  if (EXIT_DR_HOOK != NULL && !dcontext->ignore_enterexit)
 *    if (!absolute)
 *      push    %xdi
 *      push    %xsi
 *    else
 *      # support for skipping the hook
 *      RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi
 *      cmpl    %edi,0
 *      jnz     post_hook
 *    endif
 *      call    EXIT_DR_HOOK # for x64 windows, reserve 32 bytes stack space for call
 *    if (!absolute)
 *      pop    %xsi
 *      pop    %xdi
 *    endif
 *  endif
 *
 *  post_hook:
 *
 *  # restore the original register state
 *
 *  # append_restore_xflags
 *  RESTORE_FROM_UPCONTEXT xflags_OFFSET,%xax
 *  push    %xax
 *  popf            # restore eflags temporarily using dstack
 *
 *  # append_restore_simd_reg
 *  if preserve_xmm_caller_saved
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+0*16,%xmm0
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+1*16,%xmm1
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+2*16,%xmm2
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+3*16,%xmm3
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+4*16,%xmm4
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+5*16,%xmm5
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+6*16,%xmm6  # 32-bit Linux
 *    RESTORE_FROM_UPCONTEXT xmm_OFFSET+7*16,%xmm7  # 32-bit Linux
 *  endif
 *
 *  # append_restore_gpr
 *  ifdef X64
 *    RESTORE_FROM_UPCONTEXT r8_OFFSET,%r8
 *    RESTORE_FROM_UPCONTEXT r9_OFFSET,%r9
 *    RESTORE_FROM_UPCONTEXT r10_OFFSET,%r10
 *    RESTORE_FROM_UPCONTEXT r11_OFFSET,%r11
 *    RESTORE_FROM_UPCONTEXT r12_OFFSET,%r12
 *    RESTORE_FROM_UPCONTEXT r13_OFFSET,%r13
 *    RESTORE_FROM_UPCONTEXT r14_OFFSET,%r14
 *    RESTORE_FROM_UPCONTEXT r15_OFFSET,%r15
 *  endif
 *    RESTORE_FROM_UPCONTEXT xax_OFFSET,%xax
 *    RESTORE_FROM_UPCONTEXT xbx_OFFSET,%xbx
 *    RESTORE_FROM_UPCONTEXT xcx_OFFSET,%xcx
 *    RESTORE_FROM_UPCONTEXT xdx_OFFSET,%xdx
 *  if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *    RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi
 *  endif
 *  if (absolute || TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *    RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi
 *  endif
 *    RESTORE_FROM_UPCONTEXT xbp_OFFSET,%xbp
 *    RESTORE_FROM_UPCONTEXT xsp_OFFSET,%xsp
 *  if (!absolute)
 *    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *      RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi
 *    else
 *      RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi
 *    endif
 *  endif
 *
 *  # append_jmp_to_fcache_target
 *  ifdef X64 and (target is x86 mode)
 *    # we can't indirect through a register since we couldn't restore
 *    # the high bits (PR 283152)
 *    mov gencode-jmp86-value, fs:xbx_OFFSET
 *    far jmp to next instr, stored w/ 32-bit cs selector in fs:xbx_OFFSET
 *  endif
 *
 *  # jump indirect through dcontext->next_tag, set by dispatch()
 *  if (absolute)
 *    JUMP_VIA_DCONTEXT next_tag_OFFSET
 *  else
 *    if (shared)
 *      jmp *fs:xax_OFFSET
 *    else
 *      JUMP_VIA_DCONTEXT nonswapped_scratch_OFFSET
 *    endif
 *  endif
 *
 *  # now executing in fcache
 */
static byte *
emit_fcache_enter_common(dcontext_t *dcontext, generated_code_t *code,
                         byte *pc, bool absolute, bool shared)
{
    int len;
    instrlist_t ilist;
    patch_list_t patch;
#ifdef X64
    byte *jmp86_store_addr = NULL;
    byte *jmp86_target_addr = NULL;
#endif /* X64 */

    init_patch_list(&patch, absolute ? PATCH_TYPE_ABSOLUTE : PATCH_TYPE_INDIRECT_XDI);
    instrlist_init(&ilist);

    /* no support for absolute addresses on x64/ARM: we always use tls */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute && shared));
    IF_ARM(ASSERT_NOT_IMPLEMENTED(!absolute && shared));

    if (!absolute) {
        /* grab gen routine's parameter dcontext and put it into edi */
        APP(&ilist,
            IF_X86_ELSE(XINST_CREATE_load, XINST_CREATE_move)
            (dcontext, opnd_create_reg(SCRATCH_REG5), OPND_ARG1));
        if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
            IF_X86_ELSE({
                APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG4, PROT_OFFS));
            }, {
                /* FIXME i#1551: SELFPROT is not supported on ARM */
                ASSERT_NOT_REACHED();
            });
        }
    }

    append_setup_fcache_target(dcontext, &ilist, absolute, shared);
    append_call_exit_dr_hook(dcontext, &ilist, absolute, shared);

#if defined(WINDOWS) && defined(CLIENT_INTERFACE)
    /* i#249: isolate the PEB */
    if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) {
        preinsert_swap_peb(dcontext, &ilist, NULL, absolute, SCRATCH_REG5,
                           SCRATCH_REG0/*scratch*/, false/*to app*/);
    }
#endif

    /* restore the original register state */
    append_restore_xflags(dcontext, &ilist, absolute);
    append_restore_simd_reg(dcontext, &ilist, absolute);
    append_restore_gpr(dcontext, &ilist, absolute);
    append_jmp_to_fcache_target(dcontext, &ilist, code, absolute, shared, &patch
                                _IF_X64(&jmp86_store_addr)
                                _IF_X64(&jmp86_target_addr));

    /* now encode the instructions */
    len = encode_with_patch_list(dcontext, &patch, &ilist, pc);
    ASSERT(len != 0);

#ifdef X64
    if (GENCODE_IS_X86(code->gencode_mode)) {
        /* Put the absolute address in place */
        ASSERT(jmp86_target_addr != NULL && jmp86_store_addr != NULL);
        ASSERT(CHECK_TRUNCATE_TYPE_uint((ptr_uint_t)jmp86_target_addr));
        *((uint *)jmp86_store_addr) = (uint)(ptr_uint_t)jmp86_target_addr;
    }
#endif

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc + len;
}

byte *
emit_fcache_enter(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    return emit_fcache_enter_common(dcontext, code, pc,
                                    true/*absolute*/, false/*!shared*/);
}

/* Generate a shared prologue for grabbing the dcontext into XDI

   TODO: Should be used by fcache_return and shared IBL routines,
   but for now some assumptions are not quite the same.

   Only assumption is that xcx cannot be touched (IBL expects looked up address)
       if save_xdi we assume DCONTEXT_BASE_SPILL_SLOT can be clobbered

   OUTPUT: xdi contains dcontext
       if save_xdi DCONTEXT_BASE_SPILL_SLOT will contain saved value
   FIXME: xdx is the spill slot -- switch over to xdx as base reg?
       Have to measure perf effect first (case 5239)

    00:   mov xdi, tls_slot_scratch2   64 89 3d 0c 0f 00 00 mov    %edi -> %fs:0xf0c
    07:   mov tls_slot_dcontext, xdi   64 8b 3d 14 0f 00 00 mov    %fs:0xf14 -> %edi
  if TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)
     ASSERT_NOT_TESTED
  endif
*/
void
insert_shared_get_dcontext(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
                           bool save_xdi)
{
    /* needed to support grabbing the dcontext w/ shared cache */
    if (save_xdi) {
        PRE(ilist, where, SAVE_TO_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/,
                                      DCONTEXT_BASE_SPILL_SLOT));
    }
    PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/,
                                       TLS_DCONTEXT_SLOT));
    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
#ifdef X86
        bool absolute = false;
        /* PR 224798: we could avoid extra indirection by storing
         * unprotected_context_t in TLS_DCONTEXT_SLOT instead of dcontext_t
         */
        ASSERT_NOT_TESTED();
        /* we'd need a 3rd slot in order to nicely get unprot ptr into esi
         * we can do it w/ only 2 slots by clobbering dcontext ptr
         * (we could add base reg info to RESTORE_FROM_DC/SAVE_TO_DC and go
         * straight through esi to begin w/ and subtract one instr (xchg)
         */
        PRE(ilist, where, RESTORE_FROM_DC(dcontext, SCRATCH_REG5, PROT_OFFS));
        PRE(ilist, where, INSTR_CREATE_xchg(dcontext, opnd_create_reg(SCRATCH_REG4),
                                     opnd_create_reg(SCRATCH_REG5)));
        PRE(ilist, where, SAVE_TO_DC(dcontext, SCRATCH_REG5, SCRATCH_REG4_OFFS));
        PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, TLS_DCONTEXT_SLOT));
#elif defined(ARM)
        /* FIXMED i#1551: NYI on ARM */
        ASSERT_NOT_REACHED();
#endif
    }
}


/* restore XDI through TLS */
void
insert_shared_restore_dcontext_reg(dcontext_t *dcontext, instrlist_t *ilist,
                                   instr_t *where)
{
    PRE(ilist, where, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5/*xdi/r5*/,
                                       DCONTEXT_BASE_SPILL_SLOT));
}


/*  append instructions to prepare for fcache return:
 *  i.e., far jump to switch mode, load dcontext, etc.
 *
 *  # on X86
 *  ifdef X64 and (source is x86 mode)
 *    far direct jmp to next instr w/ 64-bit switch
 *  endif
 *
 *  if (!absolute)
 *    mov  %xdi,fs:xdx_OFFSET
 *    mov  fs:dcontext,%xdi
 *    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *      RESTORE_FROM_DCONTEXT PROT_OFFSET,%xdi
 *      xchg   %xsi,%xdi
 *      SAVE_TO_UPCONTEXT %xdi,xsi_OFFSET
 *      mov    fs:dcontext,%xdi
 *    endif
 *    # get xax and xdi into their real slots, via xbx
 *    SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
 *    mov    fs:xax_OFFSET,%xbx
 *    SAVE_TO_UPCONTEXT %xbx,xax_OFFSET
 *    mov    fs:xdx_OFFSET,%xbx
 *    SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET
 *  endif
 */
static void
append_prepare_fcache_return(dcontext_t *dcontext, instrlist_t *ilist,
                             bool absolute, bool shared)
{
#ifdef X86_64
    if (GENCODE_IS_X86(code->gencode_mode)) {
        instr_t *label = INSTR_CREATE_label(dcontext);
        instr_t *ljmp = INSTR_CREATE_jmp_far
            (dcontext, opnd_create_far_instr(CS64_SELECTOR, label));
        instr_set_x86_mode(ljmp, true/*x86*/);
        APP(ilist, ljmp);
        APP(ilist, label);
    }
#endif /* X86_64 */

    if (absolute)
        return;

    /* only support non-absolute w/ shared cache */
    ASSERT_NOT_IMPLEMENTED(shared);
    /* xax is in 1 scratch slot, so we have to use a 2nd scratch
     * slot in order to get dcontext into xdi
     */
    APP(ilist, SAVE_TO_TLS(dcontext, REG_DCTXT, DCONTEXT_BASE_SPILL_SLOT));
    APP(ilist, RESTORE_FROM_TLS(dcontext, REG_DCTXT, TLS_DCONTEXT_SLOT));
    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
#ifdef X86
        /* we'd need a 3rd slot in order to nicely get unprot ptr into xsi
         * we can do it w/ only 2 slots by clobbering dcontext ptr
         * (we could add base reg info to RESTORE_FROM_DC/SAVE_TO_DC and go
         * straight through xsi to begin w/ and subtract one instr (xchg)
         */
        ASSERT_NOT_TESTED();
        APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG5, PROT_OFFS));
        APP(ilist, INSTR_CREATE_xchg(dcontext, opnd_create_reg(SCRATCH_REG4),
                                     opnd_create_reg(SCRATCH_REG5)));
        APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG5, SCRATCH_REG4_OFFS));
        APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, TLS_DCONTEXT_SLOT));
#elif defined(ARM)
        /* FIXME i#1551: NYI on ARM */
        ASSERT_NOT_REACHED();
#endif /* X86/ARM */
    }
}

static void
append_call_dispatch(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
    /* call central dispatch routine */
    /* for x64 linux we could optimize and avoid the "mov rdi, rdi" */
    dr_insert_call((void *)dcontext, ilist, NULL/*append*/,
                   (void *)dispatch, 1,
                   absolute ?
                   OPND_CREATE_INTPTR((ptr_int_t)dcontext) : opnd_create_reg(REG_DCTXT));

    /* dispatch() shouldn't return! */
    insert_reachable_cti(dcontext, ilist, NULL, vmcode_get_start(),
                         (byte *)unexpected_return, true/*jmp*/, false/*!precise*/,
                         DR_REG_R11/*scratch*/, NULL);
}

/*
 * # fcache_return: context switch back to DynamoRIO.
 * # Invoked via
 * #     a) from the fcache via a fragment exit stub,
 * #     b) from indirect_branch_lookup().
 * # Invokes dispatch() with a clean dstack.
 * # Assumptions:
 * #     1) app's value in xax/r0 already saved in dcontext.
 * #     2) xax/r0 holds the linkstub ptr
 * #
 *
 * fcache_return:
 *  # append_prepare_fcache_return
 *  ifdef X64 and (source is x86 mode)
 *      far direct jmp to next instr w/ 64-bit switch
 *  endif
 *
 *  if (!absolute)
 *    mov  %xdi,fs:xdx_OFFSET
 *    mov  fs:dcontext,%xdi
 *    if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *      RESTORE_FROM_DCONTEXT PROT_OFFSET,%xdi
 *      xchg   %xsi,%xdi
 *      SAVE_TO_UPCONTEXT %xdi,xsi_OFFSET
 *      mov    fs:dcontext,%xdi
 *    endif
 *  endif
 *
 *  # append_save_gpr
 *  if (!absolute)
 *    # get xax and xdi into their real slots, via xbx
 *    SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
 *    mov    fs:xax_OFFSET,%xbx
 *    SAVE_TO_UPCONTEXT %xbx,xax_OFFSET
 *    mov    fs:xdx_OFFSET,%xbx
 *    SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET
 *  endif
 *
 *  # save the current register state to context->regs
 *  # xax already in context
 *
 *  if (absolute)
 *    SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
 *  endif
 *    SAVE_TO_UPCONTEXT %xcx,xcx_OFFSET
 *    SAVE_TO_UPCONTEXT %xdx,xdx_OFFSET
 *  if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
 *    SAVE_TO_UPCONTEXT %xsi,xsi_OFFSET
 *  endif
 *  if (absolute)
 *    SAVE_TO_UPCONTEXT %xdi,xdi_OFFSET
 *  endif
 *    SAVE_TO_UPCONTEXT %xbp,xbp_OFFSET
 *    SAVE_TO_UPCONTEXT %xsp,xsp_OFFSET
 *  ifdef X64
 *    SAVE_TO_UPCONTEXT %r8,r8_OFFSET
 *    SAVE_TO_UPCONTEXT %r9,r9_OFFSET
 *    SAVE_TO_UPCONTEXT %r10,r10_OFFSET
 *    SAVE_TO_UPCONTEXT %r11,r11_OFFSET
 *    SAVE_TO_UPCONTEXT %r12,r12_OFFSET
 *    SAVE_TO_UPCONTEXT %r13,r13_OFFSET
 *    SAVE_TO_UPCONTEXT %r14,r14_OFFSET
 *    SAVE_TO_UPCONTEXT %r15,r15_OFFSET
 *  endif
 *
 *  # append_save_simd_reg
 *  if preserve_xmm_caller_saved
 *    SAVE_TO_UPCONTEXT %xmm0,xmm_OFFSET+0*16
 *    SAVE_TO_UPCONTEXT %xmm1,xmm_OFFSET+1*16
 *    SAVE_TO_UPCONTEXT %xmm2,xmm_OFFSET+2*16
 *    SAVE_TO_UPCONTEXT %xmm3,xmm_OFFSET+3*16
 *    SAVE_TO_UPCONTEXT %xmm4,xmm_OFFSET+4*16
 *    SAVE_TO_UPCONTEXT %xmm5,xmm_OFFSET+5*16
 *    SAVE_TO_UPCONTEXT %xmm6,xmm_OFFSET+6*16  # 32-bit Linux
 *    SAVE_TO_UPCONTEXT %xmm7,xmm_OFFSET+7*16  # 32-bit Linux
 *  endif
 *
 *  # switch to clean dstack
 *  RESTORE_FROM_DCONTEXT dstack_OFFSET,%xsp
 *
 *  # append_save_clear_xflags
 *  # now save eflags -- too hard to do without a stack!
 *  pushf           # push eflags on stack
 *  pop     %xbx    # grab eflags value
 *  SAVE_TO_UPCONTEXT %xbx,xflags_OFFSET # save eflags value
 *
 *  # clear eflags now to avoid app's eflags messing up our ENTER_DR_HOOK
 *  # FIXME: this won't work at CPL0 if we ever run there!
 *  push  0
 *  popf
 *
 *  # append_call_enter_dr_hook
 *  if (ENTER_DR_HOOK != NULL && !dcontext->ignore_enterexit)
 *    # don't bother to save any registers around call except for xax
 *    # and xcx, which holds next_tag
 *    push    %xcx
 *    if (!absolute)
 *      push    %xdi
 *      push    %xsi
 *    endif
 *      push    %xax
 *    if (absolute)
 *      # support for skipping the hook (note: 32-bits even on x64)
 *      RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi
 *      cmp     %edi,0
 *      jnz     post_hook
 *    endif
 *    # for x64 windows, reserve 32 bytes stack space for call prior to call
 *    call    ENTER_DR_HOOK
 *
 *   post_hook:
 *    pop     %xax
 *    if (!absolute)
 *      pop     %xsi
 *      pop     %xdi
 *    endif
 *      pop     %xcx
 *  endif
 *
 *  # save last_exit, currently in eax, into dcontext->last_exit
 *  SAVE_TO_DCONTEXT %xax,last_exit_OFFSET
 *
 *  .ifdef WINDOWS && CLIENT_INTERFACE
 *    swap_peb
 *  .endif
 *
 *  .ifdef SIDELINE
 *    # clear cur-trace field so we don't think cur trace is still running
 *    movl    $0, _sideline_trace
 *  .endif
 *
 *  # call central dispatch routine w/ dcontext as an argument
 *  if (absolute)
 *    push    <dcontext>
 *  else
 *    push     %xdi  # for x64, mov %xdi, ARG1
 *  endif
 *  call    dispatch # for x64 windows, reserve 32 bytes stack space for call
 *  # dispatch() shouldn't return!
 *  jmp     unexpected_return
 */

/* N.B.: this routine is used to generate both the regular fcache_return
 * and a slightly different copy that is used for the miss/unlinked paths
 * for indirect_branch_lookup for self-protection.
 * ibl_end should be true only for that end of the lookup routine.
 *
 * If linkstub != NULL, used for coarse fragments, this routine assumes that:
 * - app xax is still in %xax
 * - next target pc is in DIRECT_STUB_SPILL_SLOT tls
 * - linkstub is the linkstub_t to pass back to dispatch
 * - if coarse_info:
 *   - app xcx is in MANGLE_XCX_SPILL_SLOT
 *   - source coarse info is in %xcx
 *
 * We assume this routine does not use TLS slot FLOAT_PC_STATE_SLOT (TLS_SLOT_REG1).
 */
bool
append_fcache_return_common(dcontext_t *dcontext, generated_code_t *code,
                            instrlist_t *ilist, bool ibl_end,
                            bool absolute, bool shared, linkstub_t *linkstub,
                            bool coarse_info)
{
    bool instr_targets;

    /* no support for absolute addresses on x64: we always use tls */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute && shared));

    /* currently linkstub is only used for coarse-grain exits */
    ASSERT(linkstub == NULL || !absolute);

    append_prepare_fcache_return(dcontext, ilist, absolute, shared);
    append_save_gpr(dcontext, ilist, ibl_end, absolute, code, linkstub, coarse_info);
    append_save_simd_reg(dcontext, ilist, absolute);

    /* Switch to a clean dstack as part of our scheme to avoid state kept
     * unprotected across cache executions.
     * FIXME: this isn't perfect: we switch to the dstack BEFORE we call
     * the entrance hook that will be used to coordinate other threads,
     * so if our hook suspends all other threads to protect vs cross-thread
     * attacks, the dstack is not perfectly protected.
     */
    APP(ilist, RESTORE_FROM_DC(dcontext, REG_XSP, DSTACK_OFFSET));

    append_save_clear_xflags(dcontext, ilist, absolute);
    instr_targets = append_call_enter_dr_hook(dcontext, ilist, ibl_end, absolute);

    /* save last_exit, currently in scratch_reg0 into dcontext->last_exit */
    APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG0, LAST_EXIT_OFFSET));

#if defined(WINDOWS) && defined(CLIENT_INTERFACE)
    /* i#249: isolate the PEB */
    if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) {
        preinsert_swap_peb(dcontext, ilist, NULL, absolute, SCRATCH_REG5,
                           SCRATCH_REG0/*scratch*/, true/*to priv*/);
    }
#endif /* WINDOWS && CLIENT_INTERFACE */

#ifdef SIDELINE
    if (dynamo_options.sideline) {
        /* clear cur-trace field so we don't think cur trace is still running */
        /* PR 248210: unsupported feature on x64 */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false)); /* PR 244737: fix abs address */
        APP(ilist,
            XINST_CREATE_store(dcontext,
                               OPND_CREATE_MEM32(REG_NULL, (int)&sideline_trace),
                               OPND_CREATE_INT32(0)));
    }
#endif

    append_call_dispatch(dcontext, ilist, absolute);
    return instr_targets;
}

byte *
emit_fcache_return(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    bool instr_targets;
    instrlist_t ilist;
    instrlist_init(&ilist);
    instr_targets = append_fcache_return_common(dcontext, code, &ilist,
                                                false/*!ibl_end*/,
                                                true/*absolute*/, false/*!shared*/,
                                                NULL, false/*not coarse*/);
    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, instr_targets);
    ASSERT(pc != NULL);
    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
    return pc;
}

byte *
emit_fcache_enter_shared(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    return emit_fcache_enter_common(dcontext, code, pc,
                                    false/*through xdi*/, true/*shared*/);
}

byte *
emit_fcache_return_shared(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    bool instr_targets;
    instrlist_t ilist;
    instrlist_init(&ilist);
    instr_targets = append_fcache_return_common(dcontext, code, &ilist, false/*!ibl_end*/,
                                                false/*through xdi*/, true/*shared*/,
                                                NULL, false/*not coarse*/);
    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, instr_targets);
    ASSERT(pc != NULL);
    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
    return pc;
}

byte *
emit_fcache_return_coarse(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    bool instr_targets;
    linkstub_t *linkstub = (linkstub_t *) get_coarse_exit_linkstub();
    instrlist_t ilist;
    instrlist_init(&ilist);
    instr_targets = append_fcache_return_common(dcontext, code, &ilist, false/*!ibl_end*/,
                                                false/*through xdi*/, true/*shared*/,
                                                linkstub, true/*coarse info in xcx*/);
    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, instr_targets);
    ASSERT(pc != NULL);
    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
    return pc;
}

byte *
emit_trace_head_return_coarse(dcontext_t *dcontext, generated_code_t *code, byte *pc)
{
    /* Could share tail end of coarse_fcache_return instead of duplicating */
    bool instr_targets;
    linkstub_t *linkstub = (linkstub_t *) get_coarse_trace_head_exit_linkstub();
    instrlist_t ilist;
    instrlist_init(&ilist);
    instr_targets = append_fcache_return_common(dcontext, code, &ilist, false/*!ibl_end*/,
                                                false/*through xdi*/, true/*shared*/,
                                                linkstub, false/*no coarse info*/);
    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, instr_targets);
    ASSERT(pc != NULL);
    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
    return pc;
}

/* Our coarse entrance stubs have several advantages, such as eliminating
 * future fragments, but their accompanying lazy linking does need source
 * information that is not available in each stub.  We instead have an
 * unlinked entrance stub target a per-unit prefix that records the source
 * unit.  We can then search within the unit to identify the actual source
 * entrance stub, which is enough for lazy linking (but does not find the
 * unique source tag: case 8565).  This also gives us a single indirection
 * point in the form of the prefix at which to patch the fcache_return target.
 * We also place in the prefix indirection points for trace head cache exit and
 * the 3 coarse ibl targets, to keep the cache read-only and (again) make it
 * easier to patch when persisting/sharing.
 */
uint
coarse_exit_prefix_size(coarse_info_t *info)
{
#ifdef X64
    uint flags = COARSE_32_FLAG(info);
#endif
    /* FIXME: would be nice to use size calculated in emit_coarse_exit_prefix(),
     * but we need to know size before we emit and would have to do a throwaway
     * emit, or else set up a template to be patched w/ specific info field.
     * Also we'd have to unprot .data as we don't access this until post-init.
     */
    /* We don't need to require addr16: in fact it might be better to force
     * not using it, so if we persist on P4 but run on Core we don't lose
     * performance.  We have enough space.
     */
    return SIZE_MOV_XBX_TO_TLS(flags, false) + SIZE_MOV_PTR_IMM_TO_XAX(flags)
        + 5*JMP_LONG_LENGTH;
}

byte *
emit_coarse_exit_prefix(dcontext_t *dcontext, byte *pc, coarse_info_t *info)
{
    byte *ibl;
    DEBUG_DECLARE(byte *start_pc = pc;)
    instrlist_t ilist;
    patch_list_t patch;
    instr_t *fcache_ret_prefix;
#ifdef X64
    gencode_mode_t mode = FRAGMENT_GENCODE_MODE(COARSE_32_FLAG(info));
#endif

    instrlist_init(&ilist);
    init_patch_list(&patch, PATCH_TYPE_INDIRECT_FS);

    /* prefix looks like this, using xcx instead of xbx just to make
     * the fcache_return code simpler (as it already uses xbx early),
     * and using the info as we're doing per-cache and not per-unit:
     *
     *   fcache_return_coarse_prefix:
     *   6/9 mov  %xcx, MANGLE_XCX_SPILL_SLOT
     *  5/10 mov  <info ptr>, %xcx
     *     5 jmp fcache_return_coarse
     *   trace_head_return_coarse_prefix:
     *     5 jmp trace_head_return_coarse
     *       (if -disable_traces, it jmps to fcache_return_coarse_prefix instead)
     *   coarse_ibl_ret_prefix:
     *     5 jmp coarse_ibl_ret
     *   coarse_ibl_call_prefix:
     *     5 jmp coarse_ibl_call
     *   coarse_ibl_jmp_prefix:
     *     5 jmp coarse_ibl_jmp
     *
     * We assume that info ptr is at
     *   trace_head_return_prefix - JMP_LONG_LENGTH - 4
     * in patch_coarse_exit_prefix().
     * We assume that the ibl prefixes are nothing but jmps in
     * coarse_indirect_stub_jmp_target() so we can recover the ibl type.
     *
     * FIXME case 9647: on P4 our jmp->jmp sequence will be
     * elided, but on Core we may want to switch to a jmp*, though
     * since we have no register for a base ptr we'd need a reloc
     * entry for every single stub
     */
    /* entrance stub has put target_tag into xax-slot so we use xcx-slot */
    ASSERT(DIRECT_STUB_SPILL_SLOT != MANGLE_XCX_SPILL_SLOT);

    fcache_ret_prefix = INSTR_CREATE_label(dcontext);
    APP(&ilist, fcache_ret_prefix);

#ifdef X64
    if (TEST(PERSCACHE_X86_32, info->flags)) {
        /* XXX: this won't work b/c opnd size will be wrong */
        ASSERT_NOT_IMPLEMENTED(false && "must pass opnd size to SAVE_TO_TLS");
        APP(&ilist, SAVE_TO_TLS(dcontext, REG_ECX, MANGLE_XCX_SPILL_SLOT));
        /* We assume all our data structures are <4GB which is guaranteed for
         * WOW64 processes.
         */
        ASSERT(CHECK_TRUNCATE_TYPE_int((ptr_int_t)info));
        APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_ECX),
                                         OPND_CREATE_INT32((int)(ptr_int_t)info)));
    } else { /* default code */
        if (GENCODE_IS_X86_TO_X64(mode) && DYNAMO_OPTION(x86_to_x64_ibl_opt))
            APP(&ilist, SAVE_TO_REG(dcontext, SCRATCH_REG2, REG_R9));
        else
#endif
            APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2/*xcx/r2*/,
                                    MANGLE_XCX_SPILL_SLOT));
        APP(&ilist, XINST_CREATE_load_int(dcontext,
                                          opnd_create_reg(SCRATCH_REG2/*xcx/r2*/),
                                          OPND_CREATE_INTPTR((ptr_int_t)info)));
#ifdef X64
    }
#endif
    APP(&ilist, XINST_CREATE_jump(dcontext,
        opnd_create_pc(get_direct_exit_target(dcontext,
                                              FRAG_SHARED | FRAG_COARSE_GRAIN |
                                              COARSE_32_FLAG(info)))));

    APP(&ilist, INSTR_CREATE_label(dcontext));
    add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE,
                     0 /* start of instr */,
                     (ptr_uint_t*)&info->trace_head_return_prefix);
    if (DYNAMO_OPTION(disable_traces) ||
        /* i#670: the stub stored the abs addr at persist time.  we need
         * to adjust to the use-time mod base which we do in dispatch
         * but we need to set the dcontext->coarse_exit so we go through
         * the fcache return
         */
        (info->frozen && info->mod_shift != 0)) {
        /* trace_t heads need to store the info ptr for lazy linking */
        APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_instr(fcache_ret_prefix)));
    } else {
        APP(&ilist, XINST_CREATE_jump
            (dcontext, opnd_create_pc(trace_head_return_coarse_routine(IF_X64(mode)))));
    }

    /* coarse does not support IBL_FAR so we don't bother with get_ibl_entry_type() */
    ibl = get_ibl_routine_ex(dcontext, IBL_LINKED,
                             get_source_fragment_type(dcontext,
                                                      FRAG_SHARED | FRAG_COARSE_GRAIN),
                             IBL_RETURN _IF_X64(mode));
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ibl)));
    add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE,
                     0 /* start of instr */, (ptr_uint_t*)&info->ibl_ret_prefix);

    ibl = get_ibl_routine_ex(dcontext, IBL_LINKED,
                             get_source_fragment_type(dcontext,
                                                      FRAG_SHARED | FRAG_COARSE_GRAIN),
                             IBL_INDCALL _IF_X64(mode));
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ibl)));
    add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE,
                     0 /* start of instr */, (ptr_uint_t*)&info->ibl_call_prefix);

    ibl = get_ibl_routine_ex(dcontext, IBL_LINKED,
                             get_source_fragment_type(dcontext,
                                                      FRAG_SHARED | FRAG_COARSE_GRAIN),
                             IBL_INDJMP _IF_X64(mode));
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ibl)));
    add_patch_marker(&patch, instrlist_last(&ilist), PATCH_ASSEMBLE_ABSOLUTE,
                     0 /* start of instr */, (ptr_uint_t*)&info->ibl_jmp_prefix);

    /* now encode the instructions */
    pc += encode_with_patch_list(dcontext, &patch, &ilist, pc);
    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
    ASSERT((size_t)(pc - start_pc) == coarse_exit_prefix_size(info));

    DOLOG(3, LOG_EMIT, {
        byte *dpc = start_pc;
        LOG(GLOBAL, LOG_EMIT, 3, "\nprefixes for coarse unit %s:\n", info->module);
        do {
            if (dpc == info->fcache_return_prefix)
                LOG(GLOBAL, LOG_EMIT, 3, "fcache_return_coarse_prefix:\n");
            else if (dpc == info->trace_head_return_prefix)
                LOG(GLOBAL, LOG_EMIT, 3, "trace_head_return_coarse_prefix:\n");
            else if (dpc == info->ibl_ret_prefix)
                LOG(GLOBAL, LOG_EMIT, 3, "ibl_coarse_ret_prefix:\n");
            else if (dpc == info->ibl_call_prefix)
                LOG(GLOBAL, LOG_EMIT, 3, "ibl_coarse_call_prefix:\n");
            else if (dpc == info->ibl_jmp_prefix)
                LOG(GLOBAL, LOG_EMIT, 3, "ibl_coarse_jmp_prefix:\n");
            dpc = disassemble_with_bytes(dcontext, dpc, GLOBAL);
        } while (dpc < pc);
        LOG(GLOBAL, LOG_EMIT, 3, "\n");
    });

    return pc;
}

/* Update info pointer in exit prefixes */
void
patch_coarse_exit_prefix(dcontext_t *dcontext, coarse_info_t *info)
{
    ptr_uint_t *pc = (ptr_uint_t *)
        (info->trace_head_return_prefix - JMP_LONG_LENGTH - sizeof(info));
    *pc = (ptr_uint_t) info;
}


#ifdef HASHTABLE_STATISTICS
/* note that arch_thread_init is called before fragment_thread_init,
 * so these need to be updated
 */
/* When used in a thread-shared routine, this routine clobbers XDI. The
 * caller should spill & restore it or rematerialize it as needed. */
/* NOTE - this routine does NOT save the eflags, which will be clobbered by the
 * inc */
void
append_increment_counter(dcontext_t *dcontext, instrlist_t *ilist,
                         ibl_code_t *ibl_code, patch_list_t *patch,
                         reg_id_t entry_register, /* register indirect (XCX) or NULL */
                         /* adjusted to unprot_ht_statistics_t if no entry_register */
                         uint counter_offset,
                         reg_id_t scratch_register)
{
#ifdef X86
    instr_t *counter;
#endif
    bool absolute = !ibl_code->thread_shared_routine;
    /* no support for absolute addresses on x64: we always use tls/reg */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));

    if (!INTERNAL_OPTION(hashtable_ibl_stats))
        return;

    LOG(THREAD, LOG_EMIT, 3,
        "append_increment_counter: hashtable_stats_offset=0x%x counter_offset=0x%x\n",
        ibl_code->hashtable_stats_offset, counter_offset);

    if (entry_register == REG_NULL) {
        /* adjust offset within a unprot_ht_statistics_t structure */
        counter_offset += ibl_code->hashtable_stats_offset;
    }

    if (!absolute) {
        opnd_t counter_opnd;

        /* get dcontext in register (xdi) */
        insert_shared_get_dcontext(dcontext, ilist, NULL, false/* dead register */);
        /* XDI now has dcontext */
        APP(ilist, XINST_CREATE_load(dcontext,
                                     opnd_create_reg(SCRATCH_REG5/*xdi/r5*/),
                                     OPND_DC_FIELD(absolute, dcontext, OPSZ_PTR,
                                                   FRAGMENT_FIELD_OFFSET)));

        /* XDI now has per_thread_t structure */
        /* an extra step here: find the unprot_stats field in the fragment_table_t
         * could avoid for protect_mask==0 if we always had a copy
         * in the per_thread_t struct -- see fragment.h, not worth it
         */
        if (entry_register != REG_NULL) {
            APP(ilist, XINST_CREATE_load
                (dcontext, opnd_create_reg(SCRATCH_REG5/*xdi/r5*/),
                 OPND_CREATE_MEMPTR(SCRATCH_REG5/*xdi/r5*/,
                                    ibl_code->entry_stats_to_lookup_table_offset)));
            /* XDI should now have (entry_stats - lookup_table) value,
             * so we need [xdi+xcx] to get an entry reference
             */
            counter_opnd = opnd_create_base_disp(SCRATCH_REG5/*xdi/r5*/,
                                                 entry_register, 1,
                                                 counter_offset, OPSZ_4);
        } else {
            APP(ilist, XINST_CREATE_load
                (dcontext, opnd_create_reg(SCRATCH_REG5/*xdi/r5*/),
                 OPND_CREATE_MEMPTR(SCRATCH_REG5/*xdi/r5*/,
                                    ibl_code->unprot_stats_offset)));
            /* XDI now has unprot_stats structure */
            counter_opnd = OPND_CREATE_MEM32(SCRATCH_REG5/*xdi/r5*/, counter_offset);
        }

#ifdef X86
        counter = INSTR_CREATE_inc(dcontext, counter_opnd);
        APP(ilist, counter);
#elif defined(ARM)
        /* FIXMED i#1551: NYI on ARM */
        ASSERT_NOT_IMPLEMENTED(false);
#endif
    } else {
#ifdef X86
        /* TAKE_ADDRESS will in fact add the necessary base to the statistics structure,
           hence no explicit indirection needed here */
        opnd_t counter_opnd = OPND_CREATE_MEMPTR(entry_register, counter_offset);
        counter = INSTR_CREATE_inc(dcontext, counter_opnd);
        /* hack to get both this table's unprot offset and the specific stat's offs */
        ASSERT(counter_offset < USHRT_MAX);
        if (entry_register != REG_NULL) {
            /* although we currently don't use counter_offset,
             * it doesn't hurt to support as well
             */
            ASSERT(ibl_code->entry_stats_to_lookup_table_offset < USHRT_MAX);
            add_patch_entry(patch, counter, PATCH_UNPROT_STAT|PATCH_TAKE_ADDRESS,
                            (ibl_code->entry_stats_to_lookup_table_offset << 16) |
                            counter_offset);
        } else {
            ASSERT(ibl_code->unprot_stats_offset < USHRT_MAX);
            add_patch_entry(patch, counter, PATCH_UNPROT_STAT|PATCH_TAKE_ADDRESS,
                            (ibl_code->unprot_stats_offset << 16) | counter_offset);
        }
        APP(ilist, counter);
#elif defined(ARM)
        /* FIXMED i#1551: NYI on ARM */
        ASSERT_NOT_IMPLEMENTED(false);
#endif
    }
}
#endif /* HASHTABLE_STATISTICS */

#ifdef INTERNAL
/* add a slowdown loop to measure if a routine is likely to be on a critical path */
/* note that FLAGS are clobbered */
static void
append_empty_loop(dcontext_t *dcontext, instrlist_t *ilist,
                  uint iterations, reg_id_t scratch_register)
{
# ifdef X86
    instr_t *initloop;
    instr_t *loop;
    /*       mov     ebx, iterations */
    /* loop: dec     ebx */
    /*       jnz loop */
    ASSERT(REG_NULL != scratch_register);

    initloop = XINST_CREATE_load_int(dcontext,
                                     opnd_create_reg(scratch_register),
                                     OPND_CREATE_INT32(iterations));
    loop = INSTR_CREATE_dec(dcontext, opnd_create_reg(scratch_register));
    APP(ilist, initloop);
    APP(ilist, loop);
    APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jnz_short, opnd_create_instr(loop)));
# elif defined(ARM)
        /* FIXMED i#1551: NYI on ARM */
        ASSERT_NOT_IMPLEMENTED(false);
# endif
}
#endif /* INTERNAL */

#ifdef X64
void
instrlist_convert_to_x86(instrlist_t *ilist)
{
    instr_t *in;
    for (in = instrlist_first(ilist); in != NULL; in = instr_get_next(in)) {
        instr_set_x86_mode(in, true/*x86*/);
        instr_shrink_to_32_bits(in);
    }
}
#endif

/* what we do on a hit in the hashtable */
/* Restore XBX saved from the indirect exit stub insert_jmp_to_ibl() */
/* Indirect jump through hashtable entry pointed to by XCX */
void
append_ibl_found(dcontext_t *dcontext, instrlist_t *ilist,
                 ibl_code_t *ibl_code, patch_list_t *patch,
                 uint start_pc_offset,
                 bool collision,
                 bool only_spill_state_in_tls, /* if true, no table info in TLS;
                                                * indirection off of XDI is used */
                 bool restore_eflags,
                 instr_t **fragment_found)
{
    bool absolute = !ibl_code->thread_shared_routine;
    bool target_prefix = true;
    /* eflags and xcx are restored in the target's prefix */
    /* if thread private routine */
    /*>>>    RESTORE_FROM_UPCONTEXT xbx_OFFSET,%xbx                  */
    /*>>>    jmp     *FRAGMENT_START_PC_OFFS(%xcx)                   */
    instr_t *inst = NULL;
    IF_X64(bool x86_to_x64_ibl_opt = (ibl_code->x86_to_x64_mode &&
                                      DYNAMO_OPTION(x86_to_x64_ibl_opt));)

    /* no support for absolute addresses on x64: we always use tls/reg */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));

    if (absolute) {
        inst = RESTORE_FROM_DC(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS);
    }

    if (!ibl_use_target_prefix(ibl_code)) {
        target_prefix = false;
        restore_eflags = true;
    }

#ifdef HASHTABLE_STATISTICS
    if (INTERNAL_OPTION(hashtable_ibl_stats) ||
        INTERNAL_OPTION(hashtable_ibl_entry_stats)) {
        if (!absolute && !only_spill_state_in_tls) {
            /* XDI holds app state, not a ptr to dcontext+<some offset> */
            APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
        }
        append_increment_counter(dcontext, ilist, ibl_code, patch,
                                 REG_NULL,
                                 HASHLOOKUP_STAT_OFFS(hit),
                                 SCRATCH_REG1);
        if (collision) {
            append_increment_counter(dcontext, ilist, ibl_code, patch,
                                     REG_NULL,
                                     HASHLOOKUP_STAT_OFFS(collision_hit),
                                     SCRATCH_REG1);
        }
        if (INTERNAL_OPTION(hashtable_ibl_entry_stats)) {
            /* &lookup_table[i] - should allow access to &entry_stats[i] */
            append_increment_counter(dcontext, ilist, ibl_code, patch,
                                     SCRATCH_REG2,
                                     offsetof(fragment_stat_entry_t, hits),
                                     SCRATCH_REG1);
        }
        if (!absolute && !only_spill_state_in_tls)
            APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
    }
#endif /* HASHTABLE_STATISTICS */

#ifdef INTERNAL
    if (INTERNAL_OPTION(slowdown_ibl_found)) {
        /* add a loop here */
        append_empty_loop(dcontext, ilist, INTERNAL_OPTION(slowdown_ibl_found),
                          SCRATCH_REG1 /* dead */);
    }
#endif /* INTERNAL */

    if (restore_eflags) {
        insert_restore_eflags(dcontext, ilist, NULL, 0,
                              IBL_EFLAGS_IN_TLS(), absolute _IF_X64(x86_to_x64_ibl_opt));
    }
    if (!target_prefix) {
        /* We're going to clobber the xax slot */
        ASSERT(restore_eflags);
        /* For target_delete support with no prefix, since we're
         * clobbering all the registers here, we must save something;
         * We save the tag, rather than the table entry, to avoid an
         * extra load to get the tag in target_delete:
         *     <save    %xbx to xax slot>  # put tag in xax slot for target_delete
         */
        if (absolute) {
            APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, SCRATCH_REG0_OFFS));
        } else {
            APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG1, DIRECT_STUB_SPILL_SLOT));
        }
    }
    if (IF_X64_ELSE(x86_to_x64_ibl_opt, false)) {
        APP(ilist, RESTORE_FROM_REG(dcontext, SCRATCH_REG1, REG_R10));
    } else if (absolute) {
        /* restore XBX through dcontext */
        APP(ilist, inst);
    } else {
        /* restore XBX through INDIRECT_STUB_SPILL_SLOT */
        APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, INDIRECT_STUB_SPILL_SLOT));
        DOCHECK(1, {
            if (!SHARED_IB_TARGETS())
                ASSERT(only_spill_state_in_tls);
        });
    }
    if (only_spill_state_in_tls) {
        /* If TLS doesn't hold table info, XDI was used for indirection.
         * Restore XDI through DCONTEXT_BASE_SPILL_SLOT */
        insert_shared_restore_dcontext_reg(dcontext, ilist, NULL);
    }

    if (target_prefix) {
        /* FIXME: do we want this?  seems to be a problem, I'm disabling:
         * ASSERT(!collision || start_pc_offset == FRAGMENT_START_PC_OFFS);  // c \imply FRAGMENT
         */
        APP(ilist, XINST_CREATE_jump_mem(dcontext,
                                         OPND_CREATE_MEMPTR(SCRATCH_REG2,
                                                            start_pc_offset)));
    } else {
        /* There is no prefix so we must restore all and jmp through memory:
         *     mov      start_pc_offset(%xcx), %xcx
         *     <save    %xcx to xbx slot>  # put target in xbx slot for later jmp
         *     <restore %xcx from xcx slot>
         *     jmp*     <xbx slot>
         */
        APP(ilist, XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG2),
                                     OPND_CREATE_MEMPTR(SCRATCH_REG2,
                                                        start_pc_offset)));
        if (absolute) {
#ifdef X86
            APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG2, SCRATCH_REG2_OFFS));
            if (IF_X64_ELSE(x86_to_x64_ibl_opt, false))
                APP(ilist, RESTORE_FROM_REG(dcontext, SCRATCH_REG2, REG_R9));
            else if (XCX_IN_TLS(0/*!FRAG_SHARED*/))
                APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG2,
                                            MANGLE_XCX_SPILL_SLOT));
            else
                APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG2, SCRATCH_REG2_OFFS));
            APP(ilist, XINST_CREATE_jump_mem(dcontext,
                                             OPND_DC_FIELD(absolute,
                                                           dcontext,
                                                           OPSZ_PTR,
                                                           SCRATCH_REG2_OFFS)));
#elif defined(ARM)
            /* FIXMED i#1551: NYI on ARM */
            ASSERT_NOT_REACHED();
#endif
        } else {
            APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2, INDIRECT_STUB_SPILL_SLOT));
#ifdef X64
            if (x86_to_x64_ibl_opt)
                APP(ilist, RESTORE_FROM_REG(dcontext, SCRATCH_REG2, REG_R9));
            else
#endif
                APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG2,
                                            MANGLE_XCX_SPILL_SLOT));
            APP(ilist,
                XINST_CREATE_jump_mem(dcontext,
                                      OPND_TLS_FIELD(INDIRECT_STUB_SPILL_SLOT)));
        }
    }

    if (fragment_found != NULL)
        *fragment_found = inst;
}

#ifdef PROFILE_LINKCOUNT
/* assumes linkstub in ebx, can handle NULL linkstub pointer
 * if clobber_eflags is true, assumes can kill eflags
 * else, ASSUMES CAN KILL ECX
 * both require a non null next instruction
 */
void
append_linkcount_incr(dcontext_t *dcontext, instrlist_t *ilist, bool clobber_eflags,
                      instr_t *next)
{
    /* PR 248210: unsupported feature on x64 */
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));
    ASSERT(next != NULL);
    if (clobber_eflags) {
        /*>>>    testl   ebx,ebx */
        /*>>>    je      next:  */
        /* P4 opt guide says to use test to cmp reg with 0: shorter instr */
        APP(ilist, INSTR_CREATE_test(dcontext, opnd_create_reg(REG_EBX),
                                     opnd_create_reg(REG_EBX)));
        APP(ilist, INSTR_CREATE_jcc_short(dcontext, OP_je,
                                          opnd_create_instr(next)));
# ifdef LINKCOUNT_64_BITS
        /*>>>    addl    $1,count_offs(ebx == &linkstub)                 */
        /*>>>    adcl    $0,count_offs+4(ebx == &linkstub)               */
        APP(ilist, INSTR_CREATE_add(dcontext,
                                    OPND_CREATE_MEM32(REG_EBX, LINKSTUB_COUNT_OFFS),
                                    OPND_CREATE_INT8(1)));
        APP(ilist, INSTR_CREATE_adc(dcontext,
                                    OPND_CREATE_MEM32(REG_EBX, LINKSTUB_COUNT_OFFS+4),
                                    OPND_CREATE_INT8(0)));
# else
        /*>>>    incl    count_offs(ebx == &linkstub)                    */
        APP(ilist, INSTR_CREATE_inc(dcontext,
                                    OPND_CREATE_MEM32(REG_EBX, LINKSTUB_COUNT_OFFS)));
# endif
    } else {
# ifdef LINKCOUNT_64_BITS
        instr_t *carry;
# endif
        /*>>>    movl    %ebx, %ecx */
        /*>>>    jecxz   next:      */
        APP(ilist, XINST_CREATE_load(dcontext, opnd_create_reg(REG_ECX),
                                     opnd_create_reg(REG_EBX)));
        APP(ilist, INSTR_CREATE_jecxz(dcontext, opnd_create_instr(next)));
        /*>>>    movl    count_offs(%ebx == &linkstub), %ecx              */
        /*>>>    lea     1(%ecx), %ecx                                    */
        /*>>>    movl    %ecx, count_offs(%ebx == &linkstub), %ecx        */
        APP(ilist, XINST_CREATE_load(dcontext, opnd_create_reg(REG_ECX),
                                     OPND_CREATE_MEM32(REG_EBX, LINKSTUB_COUNT_OFFS)));
        APP(ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_ECX),
                            opnd_create_base_disp(REG_ECX, REG_NULL, 0, 1, OPSZ_lea)));
        APP(ilist, XINST_CREATE_store(dcontext,
                                      OPND_CREATE_MEM32(REG_EBX, LINKSTUB_COUNT_OFFS),
                                      opnd_create_reg(REG_ECX)));
# ifdef LINKCOUNT_64_BITS
        /*>>>    jecxz   carry                                            */
        /*>>>    jmp     nocarry                                          */
        /*>>>  carry:                                                     */
        /*>>>    movl    count_offs+4(%ebx == &linkstub), %ecx            */
        /*>>>    lea     1(%ecx), %ecx                                    */
        /*>>>    movl    %ecx, count_offs+4(%ebx == &linkstub), %ecx      */
        /*>>>  nocarry:                                                   */
        carry = XINST_CREATE_load(dcontext, opnd_create_reg(REG_ECX),
                                  OPND_CREATE_MEM32(REG_EBX, LINKSTUB_COUNT_OFFS));
        APP(ilist, INSTR_CREATE_jecxz(dcontext, opnd_create_instr(carry)));
        APP(ilist, XINST_CREATE_jump(dcontext, opnd_create_instr(next)));
        APP(ilist, carry);
        APP(ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_ECX),
                            opnd_create_base_disp(REG_ECX, REG_NULL, 0, 1, OPSZ_lea)));
        APP(ilist, XINST_CREATE_store(dcontext,
                                      OPND_CREATE_MEM32(REG_EBX, LINKSTUB_COUNT_OFFS),
                                      opnd_create_reg(REG_ECX)));
# endif
    }
}
#endif


static inline void
update_ibl_routine(dcontext_t *dcontext, ibl_code_t *ibl_code)
{
    if (!ibl_code->initialized)
        return;
    patch_emitted_code(dcontext, &ibl_code->ibl_patch,
                       ibl_code->indirect_branch_lookup_routine);
    DOLOG(2, LOG_EMIT, {
        const char *ibl_name;
        const char *ibl_brtype;
        ibl_name = get_ibl_routine_name(dcontext,
                                        ibl_code->indirect_branch_lookup_routine,
                                        &ibl_brtype);
        LOG(THREAD, LOG_EMIT, 2, "Just updated indirect branch lookup\n%s_%s:\n",
            ibl_name, ibl_brtype);
        disassemble_with_annotations(dcontext, &ibl_code->ibl_patch,
                                     ibl_code->indirect_branch_lookup_routine,
                                     ibl_code->indirect_branch_lookup_routine + ibl_code->ibl_routine_length);
    });

    if (ibl_code->ibl_head_is_inlined) {
        patch_emitted_code(dcontext, &ibl_code->ibl_stub_patch, ibl_code->inline_ibl_stub_template);
        DOLOG(2, LOG_EMIT, {
            const char *ibl_name;
            const char *ibl_brtype;
            ibl_name = get_ibl_routine_name(dcontext,
                                            ibl_code->indirect_branch_lookup_routine,
                                            &ibl_brtype);
            LOG(THREAD, LOG_EMIT, 2, "Just updated inlined stub indirect branch lookup\n%s_template_%s:\n",
                ibl_name, ibl_brtype);
            disassemble_with_annotations(dcontext, &ibl_code->ibl_stub_patch, ibl_code->inline_ibl_stub_template,
                                         ibl_code->inline_ibl_stub_template + ibl_code->inline_stub_length);
        });
    }
}

void
update_indirect_branch_lookup(dcontext_t *dcontext)
{
    generated_code_t *code = THREAD_GENCODE(dcontext);

    ibl_branch_type_t branch_type;
#ifdef X64
    ASSERT(is_shared_gencode(code));
    return; /* nothing to do: routines are all thread-shared */
#endif
    protect_generated_code(code, WRITABLE);
    for (branch_type = IBL_BRANCH_TYPE_START; branch_type < IBL_BRANCH_TYPE_END; branch_type++) {
        update_ibl_routine(dcontext, &code->bb_ibl[branch_type]);
        if (PRIVATE_TRACES_ENABLED() && !DYNAMO_OPTION(shared_trace_ibl_routine))
            update_ibl_routine(dcontext, &code->trace_ibl[branch_type]);
    }
#ifdef WINDOWS
    /* update mask and table in inlined ibl at end of syscall routine */
    if (DYNAMO_OPTION(shared_syscalls)) {
        patch_emitted_code(dcontext, &code->shared_syscall_code.ibl_patch,
                           code->unlinked_shared_syscall);
        DOLOG(2, LOG_EMIT, {
            LOG(THREAD, LOG_EMIT, 2, "Just updated shared syscall routine:\n");
            disassemble_with_annotations(dcontext,
                                         &code->shared_syscall_code.ibl_patch,
                                         code->unlinked_shared_syscall,
                                         code->end_shared_syscall);
        });
    }
#endif
    protect_generated_code(code, READONLY);
}

/* i#823: handle far cti transitions.  For now only handling known cs values
 * for WOW64 when using x64 DR, but we still use this far ibl so that in
 * the future we can add general cs change handling outside of the
 * fragment (which is much simpler: see below).
 *
 * One approach is to have the mode change happen in the fragment itself via
 * ind branch mangling.  But then we have the check for known cs there and
 * thus multiple exits some of which are 32-bit and some of which are 64-bit
 * which is messy.  Instead, we spill another reg, put the selector in it,
 * and jump to this ibl prefix routine.  One drawback is that by not doing
 * the mode transition in the fragment we give up on traces extending through
 * it and we must make a far cti a trace barrier.
 *
 *   fragment:
 *     spill xbx
 *     movzx selector -> xbx
 *     spill xcx
 *     mov target -> xcx
 *     jmp far_ibl
 *
 *   far_ibl:
 *     clear top 32 bits of xcx slot
 *     xchg xcx, xbx
 *     lea xcx -32_bit_cs -> xcx
 *     jecxz to_32
 *   64: (punting on handling cs o/w)
 *     xchg xcx, xbx
 *     restore xbx
 *     jmp 64-bit ibl
 *   to-32:
 *     dcontext -> ecx
 *     mov $1 -> x86_mode_offs(ecx)
 *     xchg xcx, xbx
 *     restore xbx
 *     far ind jmp through const mem that targets 32-bit ibl
 *
 * This is much simpler for state xl8: shouldn't need any added support.
 * For unlinking: have two versions of the gencode, so the unlink
 * is the standard fragment exit cti change only.
 *
 * For non-mixed-mode, we just jmp straight to ibl.  It's simpler to
 * generate and always go through this far_ibl though rather than
 * having interp up front figure out whether a mode change for direct
 * and then have far direct sometimes be direct and sometimes use
 * indirect faar-Ibl.
 *
 * For -x86_to_x64, we assume no 32-bit un-translated code entering here.
 *
 * FIXME i#865: for mixed-mode (including -x86_to_x64), far ibl must
 * preserve the app's r8-r15 during 32-bit execution.
 */
byte *
emit_far_ibl(dcontext_t *dcontext, byte *pc, ibl_code_t *ibl_code,
             cache_pc ibl_same_mode_tgt _IF_X64(far_ref_t *far_jmp_opnd))
{
    instrlist_t ilist;
    instrlist_init(&ilist);

#ifdef X64
    if (mixed_mode_enabled()) {
        instr_t *change_mode = INSTR_CREATE_label(dcontext);
        bool source_is_x86 = DYNAMO_OPTION(x86_to_x64) ? ibl_code->x86_to_x64_mode
                                                       : ibl_code->x86_mode;
        short selector = source_is_x86 ? CS64_SELECTOR : CS32_SELECTOR;

        /* all scratch space should be in TLS only */
        ASSERT(ibl_code->thread_shared_routine || DYNAMO_OPTION(private_ib_in_tls));

        if (ibl_code->x86_mode) {
            /* we're going to look up rcx in ibl table but we only saved the
             * bottom half so zero top half now
             */
            APP(&ilist, INSTR_CREATE_mov_imm
                (dcontext, opnd_create_tls_slot(os_tls_offset(MANGLE_XCX_SPILL_SLOT) + 4),
                 OPND_CREATE_INT32(0)));
        }

        APP(&ilist, INSTR_CREATE_xchg
            (dcontext, opnd_create_reg(SCRATCH_REG1), opnd_create_reg(SCRATCH_REG2)));
        /* segment is just 2 bytes but need addr prefix if don't have rex prefix */
        APP(&ilist, INSTR_CREATE_lea
            (dcontext, opnd_create_reg(SCRATCH_REG2),
             opnd_create_base_disp(SCRATCH_REG2, REG_NULL, 0, -selector, OPSZ_lea)));
        APP(&ilist, INSTR_CREATE_jecxz
            (dcontext, opnd_create_instr(change_mode)));

        APP(&ilist, INSTR_CREATE_xchg
            (dcontext, opnd_create_reg(SCRATCH_REG1), opnd_create_reg(SCRATCH_REG2)));
        if (ibl_code->x86_to_x64_mode && DYNAMO_OPTION(x86_to_x64_ibl_opt)) {
            APP(&ilist, XINST_CREATE_load
                (dcontext, opnd_create_reg(SCRATCH_REG1), opnd_create_reg(REG_R10)));
        } else {
            APP(&ilist,
                RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, MANGLE_FAR_SPILL_SLOT));
        }
        APP(&ilist, XINST_CREATE_jump
            (dcontext, opnd_create_pc(ibl_same_mode_tgt)));

        APP(&ilist, change_mode);
        APP(&ilist, instr_create_restore_from_tls
            (dcontext, SCRATCH_REG2, TLS_DCONTEXT_SLOT));
        /* FIXME: for SELFPROT_DCONTEXT we'll need to exit to dispatch every time
         * and add logic there to set x86_mode based on LINK_FAR.
         * We do not want x86_mode sitting in unprotected_context_t.
         */
        ASSERT_NOT_IMPLEMENTED(!TEST(SELFPROT_DCONTEXT, DYNAMO_OPTION(protect_mask)));
        APP(&ilist, XINST_CREATE_store
            (dcontext, OPND_CREATE_MEM8(SCRATCH_REG2,
                                        (int)offsetof(dcontext_t, isa_mode)),
             OPND_CREATE_INT8(source_is_x86 ? DR_ISA_AMD64 : DR_ISA_IA32)));
        APP(&ilist, INSTR_CREATE_xchg
            (dcontext, opnd_create_reg(SCRATCH_REG1), opnd_create_reg(SCRATCH_REG2)));
        if (ibl_code->x86_to_x64_mode && DYNAMO_OPTION(x86_to_x64_ibl_opt)) {
            APP(&ilist, XINST_CREATE_load
                (dcontext, opnd_create_reg(SCRATCH_REG1), opnd_create_reg(REG_R10)));
        } else {
            APP(&ilist,
                RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, MANGLE_FAR_SPILL_SLOT));
        }
        if (ibl_code->x86_mode) {
            /* FIXME i#865: restore 64-bit regs here */
        } else if (ibl_code->x86_to_x64_mode && DYNAMO_OPTION(x86_to_x64_ibl_opt)) {
            /* In the current mode, XCX is spilled into R9.
             * After mode switch, will use MANGLE_XCX_SPILL_SLOT for spilling XCX.
             */
            APP(&ilist, SAVE_TO_TLS(dcontext, REG_R9, MANGLE_XCX_SPILL_SLOT));
            /* FIXME i#865: restore 64-bit regs here */
        } else {
            /* FIXME i#865: save 64-bit regs here */
            /* In the current mode, XCX is spilled into MANGLE_XCX_SPILL_SLOT.
             * After mode switch, will use R9 for spilling XCX.
             */
            APP(&ilist, RESTORE_FROM_TLS(dcontext, REG_R9, MANGLE_XCX_SPILL_SLOT));
        }
        /* For now we assume we're WOW64 and thus in low 4GB.  For general mixed-mode
         * and reachability (xref i#774) we will need a trampoline in low 4GB.
         * Note that targeting the tail of the not-taken jecxz above doesn't help
         * b/c then that needs to be 32-bit reachable.
         */
        ASSERT(CHECK_TRUNCATE_TYPE_uint((ptr_uint_t)far_jmp_opnd));
        APP(&ilist, INSTR_CREATE_jmp_far_ind
            (dcontext, opnd_create_base_disp(REG_NULL, REG_NULL, 0,
                                             (uint)(ptr_uint_t)far_jmp_opnd, OPSZ_6)));
        /* For -x86_to_x64, we can disallow 32-bit fragments from having
         * indirect branches or far branches or system calls, and thus ibl
         * is always 64-bit.
         * Even if we allow 32-bit indirection, here we have to pick one
         * lookup method, and we'd go w/ the most common, which would assume
         * a 32-bit target has been translated: so even for a same-mode far
         * cti in a 32-bit (untranslated) fragment, we'd want to do a mode
         * change here.
         */
        /* caller will set target: we just set selector */
        far_jmp_opnd->selector = DYNAMO_OPTION(x86_to_x64) ? CS64_SELECTOR
                                                           : (ushort) selector;

        if (ibl_code->x86_mode) {
            instrlist_convert_to_x86(&ilist);
        }
    } else {
#endif
        /* We didn't spill or store into xbx when mangling so just jmp to ibl.
         * Note that originally I had the existence of far_ibl, and LINK_FAR,
         * as X64 only, and only emitted far_ibl for mixed-mode.  But given that
         * it's simpler to have far direct as indirect all the time, I decided
         * to also go through a far ibl all the time.  Eventually to fully
         * handle any cs change we'll want it this way.
         *
         * XXX i#823: store cs into xbx when mangling, and then do cs
         * change here.
         */
        APP(&ilist, XINST_CREATE_jump
            (dcontext, opnd_create_pc(ibl_same_mode_tgt)));
#ifdef X64
    }
#endif

    pc = instrlist_encode(dcontext, &ilist, pc, true/*instr targets*/);
    ASSERT(pc != NULL);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}

#ifdef X86
static instr_t *
create_int_syscall_instr(dcontext_t *dcontext)
{
#ifdef WINDOWS
    /* On windows should already be initialized by syscalls_init() */
    ASSERT(get_syscall_method() != SYSCALL_METHOD_UNINITIALIZED);
    /* int $0x2e */
    if (DYNAMO_OPTION(sygate_int)) {
        /* ref case 5217, we call to an existing int in NtYieldExecution
         * to avoid tripping up Sygate. */
        return INSTR_CREATE_call(dcontext, opnd_create_pc(int_syscall_address));
    } else {
        return INSTR_CREATE_int(dcontext, opnd_create_immed_int((char)0x2e, OPSZ_1));
    }
#else
    /* if uninitialized just guess int, we'll patch up later */

    return INSTR_CREATE_int(dcontext, opnd_create_immed_int((char)0x80, OPSZ_1));
#endif
}
#endif

instr_t *
create_syscall_instr(dcontext_t *dcontext)
{
    int method = get_syscall_method();
#ifdef ARM
    if (method == SYSCALL_METHOD_SVC) {
        return INSTR_CREATE_svc(dcontext, opnd_create_immed_int((char)0x0, OPSZ_1));
    }
# elif defined(X86)
    if (method == SYSCALL_METHOD_INT || method == SYSCALL_METHOD_UNINITIALIZED) {
        return create_int_syscall_instr(dcontext);
    } else if (method == SYSCALL_METHOD_SYSENTER) {
        return INSTR_CREATE_sysenter(dcontext);
    } else if (method == SYSCALL_METHOD_SYSCALL) {
        return INSTR_CREATE_syscall(dcontext);
    }

# ifdef WINDOWS
    else if (method == SYSCALL_METHOD_WOW64) {
        /* call *fs:0xc0 */
        return INSTR_CREATE_call_ind(dcontext,
                                     opnd_create_far_base_disp(SEG_FS, REG_NULL,
                                                               REG_NULL, 0,
                                                               WOW64_TIB_OFFSET,
                                                               OPSZ_4_short2));
    }
# endif
#endif /* ARM/X86 */
    else {
        ASSERT_NOT_REACHED();
        return NULL;
    }
}

#ifdef WINDOWS

/* Insert instructions after the syscall instruction (e.g., sysenter) to
 * restore the next tag target from dcontext XSI slot to %xcx register
 * for continue execution.
 * See the comment below for emit_shared_syscall about shared syscall
 * handling.
 */
static void
insert_restore_target_from_dc(dcontext_t *dcontext,
                              instrlist_t *ilist,
                              bool all_shared)
{
    ASSERT(IF_X64_ELSE(all_shared, true)); /* PR 244737 */
    if (all_shared) {
        APP(ilist,
            instr_create_restore_from_dc_via_reg
            (dcontext, REG_NULL/*default*/, SCRATCH_REG2, SCRATCH_REG4_OFFS));
    } else {
        APP(ilist,
            instr_create_restore_from_dcontext(dcontext, SCRATCH_REG2, SCRATCH_REG4_OFFS));
    }
# ifdef CLIENT_INTERFACE
    /* i#537: we push KiFastSystemCallRet on to the stack and adjust the
     * next code to be executed at KiFastSystemCallRet.
     */
    if (get_syscall_method() == SYSCALL_METHOD_SYSENTER &&
        KiFastSystemCallRet_address != NULL) {
        /* push adjusted ecx onto stack */
        APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(SCRATCH_REG2)));
        APP(ilist, INSTR_CREATE_mov_imm(dcontext,
                                        opnd_create_reg(SCRATCH_REG2),
                                        OPND_CREATE_INT32
                                        (KiFastSystemCallRet_address)));
    }
# endif /* CLIENT_INTERFACE */
}

/* All system call instructions turn into a jump to an exit stub that
 * jumps here, with the xsi slot in dcontext (or the mangle-next-tag tls slot
 * for -shared_fragment_shared_syscalls) containing the return address
 * after the original system call instr, and xbx containing the linkstub ptr.
 *
 * Unlinked version of shared_syscall is needed, even though syscalls are
 * not part of traces (we unlink for other reasons, like flushing or
 * in-trace replacement).
 * To make unlinked entry point, have to make completely separate routine
 * that calls unlinked_ibl instead of indirect_branch_lookup, or else
 * common linked case needs an extra conditional.  I chose the latter
 * approach.  I figure an extra load and jecxz won't be noticeable.
 * Another reason is that this approach means there is a single system
 * call instruction to check for suspended threads at, instead of two.
 * To make the jecxz match forward-not-taken I actually add another store
 * on the linked path.
 * FIXME: is this a perf hit that makes it worth the code complexity
 * of two syscall routines?
 * FIXME: The 'target_trace_table' indicates whether the trace or BB IBT
 * table should be targetted. If BB2BB IBL is used (when trace building is
 * not disabled), then both traces and BBs use the same shared syscall.
 * (We emit only one.) So we can't target the BB table since that would
 * result in missed opportunities to mark secondary trace heads (trace->BB
 * IB transitions after shared syscall). So for BB2BB IBL this could be
 * a perf hit, but not a regression compared to not using BB2BB IBL. More
 * comments below in the routine.
 *
_unlinked_shared_syscall:
        SAVE_TO_UPCONTEXT $0,xax_OFFSET # flag: use unlinked ibl; xcx tls if all_shared
        jmp skip_linked
_shared_syscall:
        SAVE_TO_UPCONTEXT $1,xax_OFFSET # flag: use regular ibl; xcx tls if all_shared
skip_linked:
        .ifdef SIDELINE
        # clear cur-trace field so we don't think cur trace is still running
        mov     $0, _sideline_trace
        .endif

        .if all_shared
        SAVE_TO_TLS xdi, xdi_offset
        RESTORE_FROM_TLS xdi, dcontext_offset
        .endif

        .if !all_shared && DYNAMO_OPTION(shared_fragment_shared_syscalls)
          .if !sysenter_syscall_method
              LOAD_FROM_TLS MANGLE_NEXT_TAG_SLOT,%xdi
              SAVE_TO_UPCONTEXT %xdi,xsi_OFFSET
          .endif
        RESTORE_FROM_TLS xdi_OFFSET
        .endif

        # make registers have app values for interrupt
        .if !INTERNAL_OPTION(shared_syscalls_fastpath)
        SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET # save linkstub ptr
           .if all_shared
           # get next_tag (from xcx tls slot) into upcontext, for callback dcontext swap
           RESTORE_FROM_TLS xbx, mangle_next_tag_slot
           SAVE_TO_UPCONTEXT xbx, xsi_OFFSET
           .endif
           # %xbx is stored in TLS if shared fragments can target shared syscall
           .if DYNAMO_OPTION(shared_fragment_shared_syscalls)
           LOAD_FROM_TLS INDIRECT_STUB_SPILL_SLOT,%xbx # restore app's xbx
           .else
           RESTORE_FROM_UPCONTEXT xbx_OFFSET,%xbx # restore app's xbx
           .endif
        .endif

        .if sysenter_syscall_method
        pop     xsi_OFFSET
        push    <after-syscall-address>
        .endif

        # even if !DYNAMO_OPTION(syscalls_synch_flush) must set for reset
        movl 1, at_syscall_OFFSET # indicate to flusher we're in a syscall

        .if all_shared
        SAVE_TO_UPCONTEXT  xdi, xdi_offset
        RESTORE_FROM_TLS xdi, xdi_offset
        .endif

        # system call itself
        int     $0x2e
        # kernel may decide to run a callback here...but when we come
        #   back we can't tell the difference

        .if all_shared
        RESTORE_FROM_TLS xdi, dcontext_offset
        .endif

        # even if !DYNAMO_OPTION(syscalls_synch_flush) must clear for cbret
        movl 0, at_syscall_OFFSET # indicate to flusher/dispatch we're done w/ syscall

        # assume interrupt could have changed register values
        .if !inline_ibl_head # else, saved inside inlined ibl
           # for shared_fragment_shared_syscalls = true, absolute != true
           .if !DYNAMO_OPTION(shared_fragment_shared_syscalls)
           SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
           .endif
           .if !absolute
           SAVE_TO_TLS %xbx,INDIRECT_STUB_SPILL_SLOT
           .endif
           .if !INTERNAL_OPTION(shared_syscalls_fastpath)
           RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xbx # bring back linkstub ptr
           .endif
        .endif

        # now set up for indirect_branch_lookup
        .if !DYNAMO_OPTION(shared_fragment_shared_syscalls)
        SAVE_TO_UPCONTEXT %xcx,xcx_OFFSET
        .endif
        .if !absolute && !all_shared
        SAVE_TO_TLS %xcx,MANGLE_XCX_SPILL_SLOT
        .endif

        .if all_shared
        xchg  xcx-tls, xcx # get link/unlink flag, and save app xcx, at once
          .if x64
           mov   ecx,ecx # clear top 32 bits of flag
           .endif
        .else
        RESTORE_FROM_UPCONTEXT xax_OFFSET,%xcx # get link/unlink flag
        .endif

        # patch point: jecxz -> jmp for shared_syscall unlink
        jecxz unlink

        .if INTERNAL_OPTION(shared_syscalls_fastpath)
        mov     shared-syscalls-bb-linkstub, %xbx # set linkstub ptr
           .if inline_ibl_head
           SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET # save linkstub ptr
           .endif
        .endif

        # linked code
        RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xcx # bring back return address
        .if !inline_ibl_head
        jmp     _indirect_branch_lookup
        .else
        # inline ibl lookup head here! (don't need unlink/miss, already did
        #   that work, miss goes straight to ibl routine)
        .endif

unlink:
        # unlinked code
        RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xcx # bring back return address
        .if !inline_ibl_head
        mov  @shared_syscall_unlinked_linkstub,%xbx
        .else
           .if absolute
           SAVE_TO_UPCONTEXT @shared_syscall_unlinked_linkstub,xdi_OFFSET
           .else
           SAVE_TO_TLS @shared_syscall_unlinked_linkstub,INDIRECT_STUB_SPILL_SLOT
           .endif
           .if !DYNAMO_OPTION(atomic_inlined_linking)
           SAVE_TO_UPCONTEXT %xcx,xbx_offset
           movb  $0x1, %cl
           .else
           SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET # could have changed in kernel
           .endif
        .endif

        jmp     _unlinked_ib_lookup
 */
byte *
emit_shared_syscall(dcontext_t *dcontext, generated_code_t *code, byte *pc,
                    ibl_code_t *ibl_code,
                    patch_list_t *patch,
                    byte *ind_br_lookup_pc, byte *unlinked_ib_lookup_pc,
                    bool target_trace_table,
                    bool inline_ibl_head,
                    bool thread_shared,
                    byte **shared_syscall_pc)
{
    instrlist_t ilist;
    byte *start_pc = pc;
    instr_t *syscall; /* remember after-syscall pc b/c often suspended there */
    /* relative labels */
    instr_t *linked, *jecxz, *unlink, *skip_syscall = NULL;
    bool absolute = !thread_shared;
    uint after_syscall_ptr = 0;
    uint syscall_method = get_syscall_method();
    instr_t *adjust_tos;
    /* thread_shared indicates whether ibl is thread-shared: this bool indicates
     * whether this routine itself is all thread-shared */
    bool all_shared = IF_X64_ELSE(true, false); /* PR 244737 */
    IF_X64(bool x86_to_x64_ibl_opt = ibl_code->x86_to_x64_mode &&
        DYNAMO_OPTION(x86_to_x64_ibl_opt);)

    /* no support for absolute addresses on x64: we always use tls */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));
    /* x64 always shares shared_syscall fragments */
    IF_X64(ASSERT_NOT_IMPLEMENTED(DYNAMO_OPTION(shared_fragment_shared_syscalls)));
    /* PR 248207: haven't updated the inlining to be x64-compliant yet */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!inline_ibl_head));

    /* i#821/PR 284029: for now we assume there are no syscalls in x86 code.
     * To support them we need to update this routine, emit_do_syscall*,
     * and emit_detach_callback_code().
     */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!ibl_code->x86_mode));

    /* ibl_code was not initialized by caller */
    ibl_code->thread_shared_routine = thread_shared;
    ibl_code->branch_type = IBL_SHARED_SYSCALL;

    /* initialize the ilist */
    instrlist_init(&ilist);
    init_patch_list(patch, absolute ? PATCH_TYPE_ABSOLUTE : PATCH_TYPE_INDIRECT_XDI);
    /* We should generate some thread-shared code when
     * shared_fragment_shared_syscalls=true. */
    DOCHECK(1, {
        if (DYNAMO_OPTION(shared_fragment_shared_syscalls))
            ASSERT(!absolute);
    });
    LOG(THREAD, LOG_EMIT, 3,
        "emit_shared_syscall: pc="PFX" patch="PFX
        " inline_ibl_head=%d thread shared=%d\n",
        pc, patch, inline_ibl_head, thread_shared);

    /* FIXME: could save space by storing a single byte, and using movzx into ecx
     * below before the jecxz
     */
    if (all_shared) {
        /* xax and xbx tls slots are taken so we use xcx */
# ifdef X64
        if (x86_to_x64_ibl_opt) {
            linked = INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_R9D),
                                          OPND_CREATE_INT32(1));
        } else {
# endif
            linked = XINST_CREATE_store(dcontext,
                                        OPND_TLS_FIELD_SZ(MANGLE_XCX_SPILL_SLOT, OPSZ_4),
                                        OPND_CREATE_INT32(1));
# ifdef X64
        }
# endif
    } else
        linked = instr_create_save_immed_to_dcontext(dcontext, 1, SCRATCH_REG0_OFFS);
    APP(&ilist, linked);
    add_patch_marker(patch, instrlist_first(&ilist), PATCH_ASSEMBLE_ABSOLUTE,
                     0 /* beginning of instruction */, (ptr_uint_t*)shared_syscall_pc);

# ifdef SIDELINE
    if (dynamo_options.sideline) {
        /* clear cur-trace field so we don't think cur trace is still running */
        APP(&ilist, XINST_CREATE_store
            (dcontext, OPND_CREATE_ABSMEM((void *)&sideline_trace, OPSZ_4),
             OPND_CREATE_INT32(0)));
    }
# endif

    if (all_shared) {
        /* load %xdi w/ dcontext */
        insert_shared_get_dcontext(dcontext, &ilist, NULL, true/*save xdi*/);
    }

    /* for all-shared we move next tag from tls down below once xbx is dead */
    if (!all_shared && DYNAMO_OPTION(shared_fragment_shared_syscalls)) {
        if (syscall_method != SYSCALL_METHOD_SYSENTER) {
            /* Move the next tag field from TLS into the proper slot. */
            APP(&ilist,
                XINST_CREATE_load
                (dcontext, opnd_create_reg(SCRATCH_REG5),
                 opnd_create_tls_slot(os_tls_offset(MANGLE_NEXT_TAG_SLOT))));
            APP(&ilist,
                instr_create_save_to_dcontext(dcontext, SCRATCH_REG5, SCRATCH_REG4_OFFS));
        }
        /* restore app %xdi */
        insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
    }

    /* put linkstub ptr in slot such that when inlined it will be
     * in the right place in case of a miss */
    if (!INTERNAL_OPTION(shared_syscalls_fastpath) && DYNAMO_OPTION(indirect_stubs)) {
        /* even if inline_ibl_head and !absolute, we must put into mcontext
         * here since tls is not saved on callback stack
         */
        if (all_shared) {
            APP(&ilist,
                instr_create_save_to_dc_via_reg(dcontext, REG_NULL/*default*/,
                                                SCRATCH_REG1, SCRATCH_REG5_OFFS));
        } else {
            APP(&ilist,
                instr_create_save_to_dcontext(dcontext, SCRATCH_REG1, SCRATCH_REG5_OFFS));
        }
    } else {
        /* FIXME: for -no_indirect_stubs, we need our own complete ibl
         * here in order to use our own linkstub_t.  For now we just use
         * a trace jmp* linkstub_t from the ibl we target, making every
         * post-non-ignorable-syscall fragment a trace head.
         */
    }

    if (all_shared) {
        /* move next_tag from tls into dcontext, for callback dcontext swap,
         * using dead xbx */
        if (!DYNAMO_OPTION(indirect_stubs)) {
            /* xbx isn't dead */
            APP(&ilist, instr_create_save_to_tls
                (dcontext, SCRATCH_REG1, INDIRECT_STUB_SPILL_SLOT));
        }
        APP(&ilist,
            instr_create_restore_from_tls(dcontext, SCRATCH_REG1, MANGLE_NEXT_TAG_SLOT));
        APP(&ilist,
            instr_create_save_to_dc_via_reg(dcontext, REG_NULL/*default*/,
                                            SCRATCH_REG1, SCRATCH_REG4_OFFS));
        if (!DYNAMO_OPTION(indirect_stubs)) {
            /* restore xbx */
            APP(&ilist, instr_create_restore_from_tls
                (dcontext, SCRATCH_REG1, INDIRECT_STUB_SPILL_SLOT));
        }
    }

    /* make registers have app values for the interrupt */
    /* restore app's xbx (if we went through a stub to get here) */
    if (!INTERNAL_OPTION(shared_syscalls_fastpath) && DYNAMO_OPTION(indirect_stubs)) {
        if (DYNAMO_OPTION(shared_fragment_shared_syscalls)) {
            APP(&ilist,
                XINST_CREATE_load
                (dcontext, opnd_create_reg(SCRATCH_REG1),
                 opnd_create_tls_slot(os_tls_offset(INDIRECT_STUB_SPILL_SLOT))));
        }
        else {
            APP(&ilist,
                instr_create_restore_from_dcontext(dcontext,
                                                   SCRATCH_REG1,
                                                   SCRATCH_REG1_OFFS));
        }
    }
    if (syscall_method == SYSCALL_METHOD_SYSENTER) {
        /* PR 248210: not bothering to make x64-ready: if we do, be sure to pop into
         * next-tag tls */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        /* For sysenter, mangle pushed the next tag onto the stack,
         * so we pop it into the xsi slot and push the [to-be-patched]
         * after-syscall address.
         */
        /* We have to save xsp in case a callback is delivered and we later detach
         * (since detach expects the callback dcontext xsp to be correct). xref 9889 */
        APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_XSP, XSP_OFFSET));
        APP(&ilist,
            INSTR_CREATE_pop(dcontext,
                             opnd_create_dcontext_field(dcontext, SCRATCH_REG4_OFFS)));
        adjust_tos = INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT32(0));
        APP(&ilist, adjust_tos);
        add_patch_marker(patch, adjust_tos, PATCH_ASSEMBLE_ABSOLUTE,
                         1 /* offset of imm field */,
                         (ptr_uint_t *)&after_syscall_ptr);
    }
    /* even if !DYNAMO_OPTION(syscalls_synch_flush) must set for reset */
    ASSERT(!TEST(SELFPROT_DCONTEXT, DYNAMO_OPTION(protect_mask)));
    if (all_shared) {
        /* readers of at_syscall are ok w/ us not quite having xdi restored yet */
        APP(&ilist, XINST_CREATE_store
            (dcontext,
             opnd_create_dcontext_field_via_reg_sz(dcontext, REG_NULL/*default*/,
                                                   AT_SYSCALL_OFFSET, OPSZ_4),
             OPND_CREATE_INT32(1)));
        /* restore app %xdi */
        insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
    } else
        APP(&ilist, instr_create_save_immed_to_dcontext(dcontext, 1, AT_SYSCALL_OFFSET));

    if (DYNAMO_OPTION(sygate_sysenter) &&
        get_syscall_method() == SYSCALL_METHOD_SYSENTER) {
        /* PR 248210: not bothering to make x64-ready */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        /* case 5441 hack - set up stack so first return address points to ntdll
         * Won't worry about arithmetic eflags since no one should care about
         * those at a syscall, will preserve other regs though. */
        /* FIXME - what is the perf impact of these extra 5 instructions, we can
         * prob. do better. */
        /* note we assume xsp == xdx (if doesn't we already have prob. ref
         * case 5461) */
        /* current state
         *     xsi_slot = next_pc
         *     xsp -> after_shared_syscall
         *      +4 -> app value1
         * desired state
         *     sysenter_storage_slot = app_value1
         *     xsp -> sysenter_ret_address (ntdll ret)
         *      +4 -> after_shared_syscall
         */
        /* NOTE - the stack mangling must match that of handle_system_call()
         * and intercept_nt_continue() as not all routines looking at the stack
         * differentiate. */
        /* pop stack leaving old value (after_shared_syscall) in place */
        APP(&ilist, INSTR_CREATE_add(dcontext, opnd_create_reg(REG_XSP),
                                     OPND_CREATE_INT8(4)));
        APP(&ilist,
            INSTR_CREATE_pop(dcontext,
                             opnd_create_dcontext_field(dcontext,
                                                        SYSENTER_STORAGE_OFFSET)));
        /* instead of pulling in the existing stack value we could just patch in
         * the after syscall imm */
        /* see intel docs, source calculated before xsp dec'ed so we're pushing two
         * stack slots up into the next slot up */
        APP(&ilist, INSTR_CREATE_push(dcontext, OPND_CREATE_MEM32(REG_XSP, -8)));
        APP(&ilist, INSTR_CREATE_push_imm
            (dcontext, OPND_CREATE_INTPTR((ptr_int_t)sysenter_ret_address)));
    }

    /* syscall itself */
    APP(&ilist, create_syscall_instr(dcontext));
    syscall = instrlist_last(&ilist);

    if (DYNAMO_OPTION(sygate_sysenter) &&
        get_syscall_method() == SYSCALL_METHOD_SYSENTER) {
        /* PR 248210: not bothering to make x64-ready */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        /* case 5441 hack - we popped an extra stack slot, need to fill with saved
         * app value */
        APP(&ilist,
            INSTR_CREATE_push(dcontext,
                              opnd_create_dcontext_field(dcontext,
                                                         SYSENTER_STORAGE_OFFSET)));
    }

    /* Now that all instructions from the linked entry point up to and
     * including the syscall have been added, prepend the unlinked path
     * instructions. We wait until the syscall has been added because when
     * shared_syscalls_fastpath = true and "int 2e" syscalls are used, the
     * target of the unlinked path's jmp is the syscall itself.
     */
    /* these two in reverse order since prepended */
    instrlist_prepend(&ilist, XINST_CREATE_jump
                      (dcontext, opnd_create_instr(instr_get_next(linked))));
    if (all_shared) {
        /* xax and xbx tls slots are taken so we use xcx */
# ifdef X64
        if (x86_to_x64_ibl_opt) {
            instrlist_prepend(&ilist, INSTR_CREATE_mov_imm
                              (dcontext, opnd_create_reg(REG_R9D), OPND_CREATE_INT32(0)));
        } else {
# endif
            instrlist_prepend(&ilist, XINST_CREATE_store
                              (dcontext,
                               /* simpler to do 4 bytes even on x64 */
                               OPND_TLS_FIELD_SZ(MANGLE_XCX_SPILL_SLOT, OPSZ_4),
                               OPND_CREATE_INT32(0)));
# ifdef X64
        }
# endif
    } else {
        instrlist_prepend(&ilist,
                          instr_create_save_immed_to_dcontext(dcontext, 0, SCRATCH_REG0_OFFS));
    }

    /* even if !DYNAMO_OPTION(syscalls_synch_flush) must clear for cbret */
    if (all_shared) {
        /* readers of at_syscall are ok w/ us spilling xdi first */
        insert_shared_get_dcontext(dcontext, &ilist, NULL, true/*save xdi*/);
        APP(&ilist, XINST_CREATE_store
            (dcontext,
             opnd_create_dcontext_field_via_reg_sz(dcontext, REG_NULL/*default*/,
                                                   AT_SYSCALL_OFFSET, OPSZ_4),
             OPND_CREATE_INT32(0)));
    } else
        APP(&ilist, instr_create_save_immed_to_dcontext(dcontext, 0, AT_SYSCALL_OFFSET));

    if (!inline_ibl_head && DYNAMO_OPTION(indirect_stubs)) {
        /* FIXME Can we remove the write to the mcontext for the !absolute
         * case? Initial tests w/notepad crashed when doing so -- we should
         * look deeper.
         */
        /* save app's xbx (assume interrupt could have changed it) */
        /* Remember, shared_fragment_shared_syscalls=true means absolute=false,
         * so for shared_fragment_shared_syscalls=true %xbx is saved in
         * the !absolute "if" that follows.
         */
        if (!DYNAMO_OPTION(shared_fragment_shared_syscalls)) {
            APP(&ilist,
                instr_create_save_to_dcontext(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS));
        }
        if (!absolute) {
            /* save xbx in TLS so that downstream code can find it */
            APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG1, INDIRECT_STUB_SPILL_SLOT));
        }
        if (!INTERNAL_OPTION(shared_syscalls_fastpath)) {
            if (all_shared) {
                APP(&ilist,
                    instr_create_restore_from_dc_via_reg(dcontext, REG_NULL/*default*/,
                                                         SCRATCH_REG1, SCRATCH_REG5_OFFS));
            } else {
                APP(&ilist,
                    instr_create_restore_from_dcontext(dcontext, SCRATCH_REG1, SCRATCH_REG5_OFFS));
            }
        }
    } /* if inlined, xbx will be saved inside inlined ibl; if no indirect stubs,
       * xbx will be saved in the ibl routine, or not at all if unlinked
       */

    /* set up for indirect_branch_lookup */
    /* save app's xcx */
    if (!DYNAMO_OPTION(shared_fragment_shared_syscalls))
        APP(&ilist, instr_create_save_to_dcontext(dcontext, SCRATCH_REG2, SCRATCH_REG2_OFFS));
    /* FIXME Can we remove the write to the mcontext for the !absolute
     * case, as suggested above? */
    if (!absolute && !all_shared/*done later*/) {
        /* save xcx in TLS */
# ifdef X64
        if (x86_to_x64_ibl_opt)
            APP(&ilist, SAVE_TO_REG(dcontext, SCRATCH_REG2, REG_R9));
        else
# endif
            APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2, MANGLE_XCX_SPILL_SLOT));
    }

    if (!INTERNAL_OPTION(shared_syscalls_fastpath)) {
        if (inline_ibl_head && DYNAMO_OPTION(indirect_stubs)) {
            /* Need to move linkstub ptr from mcontext->xdi into tls.
             * We couldn't put it directly there pre-syscall b/c tls
             * is not saved on callback stack!
             * We do this now to take advantage of xcx being dead.
             */
            APP(&ilist,
                instr_create_restore_from_dcontext(dcontext, SCRATCH_REG2, SCRATCH_REG5_OFFS));
            APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2, TLS_SLOT_REG3));
        }
    }

    /* get link flag */
    unlink = INSTR_CREATE_label(dcontext);
    if (all_shared) {
        /* we stored 4 bytes so get 4 bytes back; save app xcx at same time */
# ifdef X64
        if (x86_to_x64_ibl_opt) {
            APP(&ilist, INSTR_CREATE_xchg
                (dcontext, opnd_create_reg(REG_R9), opnd_create_reg(SCRATCH_REG2)));
        } else {
# endif
            APP(&ilist, INSTR_CREATE_xchg
                (dcontext, OPND_TLS_FIELD(MANGLE_XCX_SPILL_SLOT), opnd_create_reg(SCRATCH_REG2)));
# ifdef X64
        }
        /* clear top 32 bits */
        APP(&ilist, XINST_CREATE_store
            (dcontext, opnd_create_reg(REG_ECX), opnd_create_reg(REG_ECX)));
# endif
        /* app xdi is restored later after we've restored next_tag from xsi slot */
    } else {
        APP(&ilist, instr_create_restore_from_dcontext(dcontext, SCRATCH_REG2, SCRATCH_REG0_OFFS));
    }
    jecxz = INSTR_CREATE_jecxz(dcontext, opnd_create_instr(unlink));
    APP(&ilist, jecxz);
    /* put linkstub ptr in xbx */
    if (INTERNAL_OPTION(shared_syscalls_fastpath) && DYNAMO_OPTION(indirect_stubs)) {
        APP(&ilist, INSTR_CREATE_mov_imm
            (dcontext, opnd_create_reg(SCRATCH_REG1),
             OPND_CREATE_INTPTR((ptr_int_t)get_shared_syscalls_bb_linkstub())));
        /* put linkstub ptr in slot such that when inlined it will be
         * in the right place in case of a miss */
        if (inline_ibl_head) {
            if (absolute) {
                APP(&ilist,
                    instr_create_save_to_dcontext(dcontext, SCRATCH_REG1, SCRATCH_REG5_OFFS));
            } else {
                APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG1, TLS_SLOT_REG3));
            }
        }
    } /* else case is up above to use dead xcx reg */

    /* Add a patch marker once we know that there's an instr in the ilist
     * after the syscall. */
    add_patch_marker(patch, instr_get_next(syscall) /* take addr of next instr */,
                     PATCH_UINT_SIZED /* pc relative */,
                     0 /* beginning of instruction */,
                     (ptr_uint_t*)&code->sys_syscall_offs);
    add_patch_marker(patch, jecxz, PATCH_UINT_SIZED /* pc relative */,
                     0 /* point at opcode of jecxz */,
                     (ptr_uint_t*)&code->sys_unlink_offs);

    /* put return address in xcx (was put in xsi slot by mangle.c, or in tls
     * by mangle.c and into xsi slot before syscall for all_shared) */
    /* we duplicate the restore from dc and restore of xdi on the link
     * and unlink paths, rather than putting next_tag back into tls here
     * (can't rely on that tls slot persisting over syscall w/ callbacks)
     */
    insert_restore_target_from_dc(dcontext, &ilist, all_shared);
    if (all_shared) {
        /* restore app %xdi */
        insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
    }

    /* FIXME As noted in the routine's header comments, shared syscall targets
     * the trace [IBT] table when both traces and BBs could be using it (when
     * trace building is not disabled). Ideally, we want traces to target the
     * trace table and BBs to target the BB table (when BB2BB IBL is on, that is).
     * Since the BB IBT table usually holds non-trace head BBs as well as traces
     * (including traces is option controlled), using it will doubtless lead to
     * higher IBL hit rate, though it's unclear if there would be a visible
     * impact on performance. Since BBs and traces use different fake linkstubs
     * when executing thru shared syscall, we can detect what the last fragment
     * was and conditionally jump to the ideal IBL routine.
     *
     * Since the EFLAGS at this point hold app state, we'd need to save/restore
     * them prior to executing the IBL code if we used a 'cmp' followed by cond.
     * branch. Or we could save the EFLAGS and jump to a new entry point in the
     * IBL, one just after the 'seto'. (We'd have to move any load of %xdi
     * with the dcontext to just below the 'seto'.)
     *
     * We could avoid conditional code altogether if both inline_trace_ibl
     * and inline_bb_ibl are false. Instead of passing fake linkstub addresses
     * from a fragment exit stub through shared syscall, we could pass the
     * address of the IBL routine to jump to -- BB IBL for BBs and trace IBL
     * for traces. Shared syscall would do an indirect jump to reach the proper
     * routine. On an IBL miss, the address is passed through to dispatch, which
     * can convert the address into the appropriate fake linkstub address (check
     * if the address is within emitted code and equals either BB or trace IBL.)
     * Since an address is being passed around and saved to the dcontext during
     * syscalls, some of which could be relatively long, this is a security
     * hole.
     */
    if (!inline_ibl_head) {
        APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ind_br_lookup_pc)));
    } else {
        append_ibl_head(dcontext, &ilist, ibl_code, patch, NULL, NULL, NULL,
                        opnd_create_pc(ind_br_lookup_pc),
                        false/*miss cannot have 8-bit offs*/,
                        target_trace_table,
                        inline_ibl_head);
    }

    /* unlink path (there can be no fall-through) */
    APP(&ilist, unlink);
    /* we duplicate the restore from dc and restore of xdi on the link
     * and unlink paths: see note above */
    insert_restore_target_from_dc(dcontext, &ilist, all_shared);
    if (all_shared) {
        /* restore app %xdi */
        insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
    }
    /* When traversing the unlinked entry path, since IBL is bypassed
     * control reaches dispatch, and the target is (usually) added to the IBT
     * table. But since the unlinked path was used, the target may already be
     * present in the table so the add attempt is unnecessary and triggers an
     * ASSERT in fragment_add_ibl_target().
     *
     * The add attempt is bypassed by moving an unlinked linkstub ptr into the
     * correct place -- for inlined IBL, the %xdi slot, otherwise, %xbx. This will
     * identify exits from the unlinked path. The stub's flags are set to 0
     * to bypass the add IBL target attempt.
     */
    if (!inline_ibl_head) {
        if (DYNAMO_OPTION(indirect_stubs)) {
            APP(&ilist, INSTR_CREATE_mov_imm
                (dcontext, opnd_create_reg(SCRATCH_REG1),
                 OPND_CREATE_INTPTR((ptr_int_t)get_shared_syscalls_unlinked_linkstub())));
        }
    }
    else {
        if (absolute) {
            APP(&ilist, instr_create_save_immed_to_dcontext
                (dcontext, (int)(ptr_int_t)get_shared_syscalls_unlinked_linkstub(),
                 SCRATCH_REG5_OFFS));
        }
        else {
            APP(&ilist, XINST_CREATE_store
                (dcontext, OPND_TLS_FIELD(TLS_SLOT_REG3),
                 OPND_CREATE_INTPTR((ptr_int_t)get_shared_syscalls_unlinked_linkstub())));
        }
        if (!DYNAMO_OPTION(atomic_inlined_linking)) {
            /* we need to duplicate the emit_inline_ibl_stub unlinking race
             * condition detection code here, before we jump to unlink
             */
            /*
             * # set flag in xcx (bottom byte = 0x1) so that unlinked path can
             * # detect race condition during unlinking
             * 2   movb  $0x1, %cl
            */
            /* we expect target saved in xbx_offset */
            if (absolute) {
                APP(&ilist,
                    instr_create_save_to_dcontext(dcontext, SCRATCH_REG2, SCRATCH_REG1_OFFS));
            } else
                APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2, TLS_SLOT_REG1));
            APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_CL),
                                             OPND_CREATE_INT8(1)));
        } else {
            /* xbx could have changed in kernel, unlink expects it saved */
            if (absolute) {
                APP(&ilist,
                    instr_create_save_to_dcontext(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS));
            } else
                APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG1, TLS_SLOT_REG1));
        }
    }
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(unlinked_ib_lookup_pc)));

    pc += encode_with_patch_list(dcontext, patch, &ilist, pc);
    if (syscall_method == SYSCALL_METHOD_SYSENTER) {
        ASSERT(after_syscall_ptr != 0);
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        *((uint *)(ptr_uint_t) after_syscall_ptr) =
            (uint)(ptr_uint_t) (code->unlinked_shared_syscall + code->sys_syscall_offs);
    }
    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}


static byte *
emit_dispatch_template(dcontext_t *dcontext, byte *pc, uint offset)
{
    instrlist_t ilist;

    /* PR 244737: we don't use this for x64 b/c syscall routines are thread-shared */
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));

    /* initialize the ilist */
    instrlist_init(&ilist);

    /* load %edi w/the dcontext */
    insert_shared_get_dcontext(dcontext, &ilist, NULL, true);

    /* load the generated_code_t address */
    APP(&ilist, XINST_CREATE_load(dcontext, opnd_create_reg(REG_EDI),
                                  OPND_DC_FIELD(false, dcontext, OPSZ_PTR,
                                                PRIVATE_CODE_OFFSET)));

    /* jump thru the address in the offset */
    APP(&ilist, XINST_CREATE_jump_mem(dcontext, OPND_CREATE_MEM32(REG_EDI, offset)));

    pc = instrlist_encode(dcontext, &ilist, pc, false /* no instr targets */);
    ASSERT(pc != NULL);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}

byte *
emit_shared_syscall_dispatch(dcontext_t *dcontext, byte *pc)
{
    return emit_dispatch_template(dcontext, pc,
                                  offsetof(generated_code_t, shared_syscall));
}

byte *
emit_unlinked_shared_syscall_dispatch(dcontext_t *dcontext, byte *pc)
{
    return emit_dispatch_template(dcontext, pc, offsetof(generated_code_t,
                                                         unlinked_shared_syscall));
}

/* Links the shared_syscall routine to go directly to the indirect branch
 * lookup routine.
 * If it is already linked, does nothing.
 * Assumes caller takes care of any synchronization if this is called
 * from other than the owning thread!
 */
/* NOTE the link/unlink of shared syscall is atomic w/respect to threads in the
 * cache since is only single byte write (always atomic). */
static void
link_shared_syscall_common(generated_code_t *code)
{
    /* strategy: change "jmp unlink" back to "jecxz unlink" */
    cache_pc pc;
    if (code == NULL) /* shared_code_x86 */
        return;
    pc = code->unlinked_shared_syscall + code->sys_unlink_offs;
    if (*pc != JECXZ_OPCODE) {
        protect_generated_code(code, WRITABLE);
        ASSERT(*pc == JMP_SHORT_OPCODE);
        *pc = JECXZ_OPCODE;
        protect_generated_code(code, READONLY);
    }
}

void
link_shared_syscall(dcontext_t *dcontext)
{
    ASSERT(IS_SHARED_SYSCALL_THREAD_SHARED || dcontext != GLOBAL_DCONTEXT);
    if (dcontext == GLOBAL_DCONTEXT) {
        link_shared_syscall_common(SHARED_GENCODE(GENCODE_X64));
#ifdef X64
        /* N.B.: there are no 32-bit syscalls for WOW64 with 64-bit DR (i#821) */
        if (DYNAMO_OPTION(x86_to_x64))
            link_shared_syscall_common(SHARED_GENCODE(GENCODE_X86_TO_X64));
#endif
    } else
        link_shared_syscall_common(THREAD_GENCODE(dcontext));
}

/* Unlinks the shared_syscall routine so it goes back to dispatch after
 * the system call itself.
 * If it is already unlinked, does nothing.
 * Assumes caller takes care of any synchronization if this is called
 * from other than the owning thread!
 */
static void
unlink_shared_syscall_common(generated_code_t *code)
{
    /* strategy: change "jecxz unlink" to "jmp unlink" */
    cache_pc pc;
    if (code == NULL) /* shared_code_x86 */
        return;
    pc = code->unlinked_shared_syscall + code->sys_unlink_offs;
    if (*pc != JMP_SHORT_OPCODE) {
        protect_generated_code(code, WRITABLE);
        ASSERT(*pc == JECXZ_OPCODE);
        *pc = JMP_SHORT_OPCODE;
        protect_generated_code(code, READONLY);
    }
}

void
unlink_shared_syscall(dcontext_t *dcontext)
{
    ASSERT(IS_SHARED_SYSCALL_THREAD_SHARED || dcontext != GLOBAL_DCONTEXT);
    if (dcontext == GLOBAL_DCONTEXT) {
        unlink_shared_syscall_common(SHARED_GENCODE(GENCODE_X64));
#ifdef X64
        /* N.B.: there are no 32-bit syscalls for WOW64 with 64-bit DR (i#821) */
        if (DYNAMO_OPTION(x86_to_x64))
            unlink_shared_syscall_common(SHARED_GENCODE(GENCODE_X86_TO_X64));
#endif
    } else
        unlink_shared_syscall_common(THREAD_GENCODE(dcontext));
}

#endif /* defined(WINDOWS) ****************************/

#ifdef WINDOWS
/* used by detach, this inlines the callback stack so that we can detach
 *
 * we spill xax and xbx to the PID and TID (respectively) TLS slots until we find
 * the thread private state at which point we switch to using it for spilling.  We
 * use the TID slot (as opposed to the PEB slot that callback.c uses) because we need
 * to get the TID anyways.
 *
 * note the counter walks backwards through the array of saved address (they are
 * stored in reverse order)
 *
 * FIXME - we clobber eflags, but those should be dead after a system call anyways.
 *
 * From emit_patch_syscall()
 * after_shared_syscall:
 *   jmp _after_do_syscall
 *
 * after_do_syscall:
 *   mov xax -> PID in TEB
 *   mov &callback_buf -> xax
 *   jmp xax
 *
 *
 * From emit_detach_callback_code()
 * // xax is currently saved in PID slot of TEB
 *  callback_buf:
 *   xchg xbx, TID in TEB  // store xbx and get TID
 *   mov &callback_state -> xax  //the array of detach_callback_stack_t
 *  match_tid:
 *   cmp xbx, thread_id_offset(xax)
 *   je match_found
 *   add xax, sizeof(detach_callback_stack_t)
 *   jmp match_tid  // Note - infinite loop till find or crash (not clear what else to do)
 *  match_found:  // xax now holds ptr to the detach_callback_stack_t for this thread
 *   xchg xbx, TID in TEB  // restore tid & xbx
 *   mov xbx -> xbx_save_offset(xax)
 *   mov PID -> xbx
 *   xchg xbx, PID in TEB  // restore pid, saved xax now in xbx
 *   mov xbx -> xax_save_offset(xax)
 *   mov xcx -> xcx_save_offset(xax)
 *   mov count_offset(xax) -> xbx  // need count in register for addr calculation below
 *   sub xbx, 1
 *   mov xbx -> count_offset(xax)
 *   mov callback_addrs_offset(xax) -> xcx
 *   mov (xcx + xbx*sizeof(app_pc)) -> xcx // xcx now holds the xip we need to go to
 *   mov xcx -> target_offset(xax)
 *   mov xcx_save_offset(xax) -> xcx
 *   mov xbx_save_offset(xax) -> xbx
 *   lea code_buf_offset(xax) -> xax
 *   jmp xax
 *
 214f1000 6764871e2400     xchg    fs:[0024],ebx
 214f1006 b800114f21       mov     eax,0x214f1100
 214f100b 3b18             cmp     ebx,[eax]
 214f100d 0f8408000000     je      214f101b
 214f1013 83c03c           add     eax,0x3c
 214f1016 e9f0ffffff       jmp     214f100b
 214f101b 6764871e2400     xchg    fs:[0024],ebx
 214f1021 895810           mov     [eax+0x10],ebx
 214f1024 bb5c040000       mov     ebx,0x45c
 214f1029 6764871e2000     xchg    fs:[0020],ebx
 214f102f 89580c           mov     [eax+0xc],ebx
 214f1032 894814           mov     [eax+0x14],ecx
 214f1035 8b5804           mov     ebx,[eax+0x4]
 214f1038 83eb01           sub     ebx,0x1
 214f103b 895804           mov     [eax+0x4],ebx
 214f103e 8b4808           mov     ecx,[eax+0x8]
 214f1041 8b0c99           mov     ecx,[ecx+ebx*4]
 214f1044 894818           mov     [eax+0x18],ecx
 214f1047 8b4814           mov     ecx,[eax+0x14]
 214f104a 8b5810           mov     ebx,[eax+0x10]
 214f104d 8d401c           lea     eax,[eax+0x1c]
 214f1050 ffe0             jmp     eax
 *
 *
 * From emit_detach_callback_final_jmp()
 * _detach_callback_stack_t.code_buf (thread private)
 *   mov (xax_save_offset) -> xax
 *   jmp *target
 *
 214f111c a10c114f21       mov     eax,[214f110c]
 214f1121 ff2518114f21     jmp     dword ptr [214f1118]
 */
byte *
emit_detach_callback_code(dcontext_t *dcontext, byte *buf,
                          detach_callback_stack_t *callback_state)
{
    byte *pc = buf;
    instrlist_t ilist;
    instr_t *match_tid = INSTR_CREATE_label(dcontext),
        *match_found = INSTR_CREATE_label(dcontext);

    /* i#821/PR 284029: for now we assume there are no syscalls in x86 code, so
     * we do not need to generate an x86 version
     */

    /* initialize the ilist */
    instrlist_init(&ilist);

    /* create instructions */
    APP(&ilist, INSTR_CREATE_xchg(dcontext, opnd_create_tls_slot(TID_TIB_OFFSET),
                                  opnd_create_reg(SCRATCH_REG1)));
    APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(SCRATCH_REG0),
                                     OPND_CREATE_INTPTR((ptr_uint_t)callback_state)));
    APP(&ilist, match_tid);
    /* FIXME - we clobber eflags.  We don't anticipate that being a problem on callback
     * returns since syscalls clobber eflags too. */
    APP(&ilist, INSTR_CREATE_cmp
        (dcontext, opnd_create_reg(SCRATCH_REG1),
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, tid))));
    APP(&ilist, INSTR_CREATE_jcc_short(dcontext, OP_je, opnd_create_instr(match_found)));
    APP(&ilist, INSTR_CREATE_add(dcontext, opnd_create_reg(SCRATCH_REG0),
                                 OPND_CREATE_INT_32OR8(sizeof(detach_callback_stack_t))));
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_instr(match_tid)));
    APP(&ilist, match_found);
    /* found matching tid ptr is in xax
     * spill registers into local slots and restore TEB fields */
    APP(&ilist, INSTR_CREATE_xchg(dcontext, opnd_create_tls_slot(TID_TIB_OFFSET),
                                  opnd_create_reg(SCRATCH_REG1)));
    APP(&ilist, XINST_CREATE_store
        (dcontext,
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, xbx_save)),
         opnd_create_reg(SCRATCH_REG1)));
    APP(&ilist, INSTR_CREATE_mov_imm
        (dcontext, opnd_create_reg(SCRATCH_REG1),
         OPND_CREATE_INTPTR((ptr_uint_t)get_process_id())));
    APP(&ilist, INSTR_CREATE_xchg(dcontext, opnd_create_tls_slot(PID_TIB_OFFSET),
                                  opnd_create_reg(SCRATCH_REG1)));
    APP(&ilist, XINST_CREATE_store
        (dcontext,
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, xax_save)),
         opnd_create_reg(SCRATCH_REG1)));
    APP(&ilist, XINST_CREATE_store
        (dcontext,
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, xcx_save)),
         opnd_create_reg(SCRATCH_REG2)));
    /* now find the right address and move it into target while updating the
     * thread private count */
    APP(&ilist, XINST_CREATE_load
        (dcontext, opnd_create_reg(SCRATCH_REG1),
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, count))));
    /* see earlier comment on clobbering eflags */
    APP(&ilist, INSTR_CREATE_sub(dcontext, opnd_create_reg(SCRATCH_REG1),
                                 OPND_CREATE_INT8(1)));
    APP(&ilist, XINST_CREATE_store
        (dcontext,
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, count)),
         opnd_create_reg(SCRATCH_REG1)));
    APP(&ilist, XINST_CREATE_load
        (dcontext, opnd_create_reg(SCRATCH_REG2),
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, callback_addrs))));
    APP(&ilist, XINST_CREATE_load
        (dcontext, opnd_create_reg(SCRATCH_REG2),
         opnd_create_base_disp(SCRATCH_REG2, SCRATCH_REG1, sizeof(app_pc), 0, OPSZ_PTR)));
    APP(&ilist, XINST_CREATE_store
        (dcontext,
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, target)),
         opnd_create_reg(SCRATCH_REG2)));
    APP(&ilist, XINST_CREATE_load
        (dcontext, opnd_create_reg(SCRATCH_REG2),
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, xcx_save))));
    APP(&ilist, XINST_CREATE_load
        (dcontext, opnd_create_reg(SCRATCH_REG1),
         OPND_CREATE_MEMPTR(SCRATCH_REG0, offsetof(detach_callback_stack_t, xbx_save))));
    APP(&ilist, INSTR_CREATE_lea
        (dcontext, opnd_create_reg(SCRATCH_REG0),
         OPND_CREATE_MEM_lea(SCRATCH_REG0, REG_NULL, 0,
                             offsetof(detach_callback_stack_t, code_buf))));
    APP(&ilist, INSTR_CREATE_jmp_ind(dcontext, opnd_create_reg(SCRATCH_REG0)));

    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, true /* instr targets */);
    ASSERT(pc != NULL);
    ASSERT(pc - buf < DETACH_CALLBACK_CODE_SIZE);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}

void
emit_detach_callback_final_jmp(dcontext_t *dcontext,
                               detach_callback_stack_t *callback_state)
{
    byte *pc = callback_state->code_buf;
    instrlist_t ilist;

    /* initialize the ilist */
    instrlist_init(&ilist);

    /* restore eax and jmp target */
    APP(&ilist, XINST_CREATE_load
        (dcontext, opnd_create_reg(SCRATCH_REG0),
         OPND_CREATE_ABSMEM(&(callback_state->xax_save), OPSZ_PTR)));
    APP(&ilist, INSTR_CREATE_jmp_ind
        (dcontext, OPND_CREATE_ABSMEM(&(callback_state->target), OPSZ_PTR)));

    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, true /* instr targets */);
    ASSERT(pc != NULL);
    ASSERT(pc - callback_state->code_buf < DETACH_CALLBACK_FINAL_JMP_SIZE);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
}


void
emit_patch_syscall(dcontext_t *dcontext, byte *target _IF_X64(gencode_mode_t mode))
{
    byte *pc = after_do_syscall_code_ex(dcontext _IF_X64(mode));
    instrlist_t ilist;

    if (DYNAMO_OPTION(shared_syscalls)) {
        /* Simply patch shared_syscall to jump to after_do_syscall. Only
         * one array of callback stack addresses is needed -- a return from
         * a callback entered from shared_syscall will jump to the patched
         * after_do_syscall and fetch the correct address off of our
         * callback stack copy. It "just works".
         */
        instr_t *instr = XINST_CREATE_jump(dcontext, opnd_create_pc(pc));
        DEBUG_DECLARE(byte *nxt_pc =)
            instr_encode(dcontext, instr,
                         after_shared_syscall_code_ex(dcontext _IF_X64(mode)));
        ASSERT(nxt_pc != NULL);
        /* check that there was room - shared_syscall should be before do_syscall
         * anything between them is dead at this point */
        ASSERT(after_shared_syscall_code_ex(dcontext _IF_X64(mode)) < pc && nxt_pc < pc);
        instr_destroy(dcontext, instr);
        LOG(THREAD, LOG_EMIT, 2,
            "Finished patching shared syscall routine for detach -- patch "PFX
            " to jump to "PFX"\n", after_shared_syscall_code(dcontext), pc);
    }

    /* initialize the ilist */
    instrlist_init(&ilist);

    /* patch do_syscall to jmp to target */
    /* Note that on 64-bit target may not be reachable in which case we need to inline
     * the first register spill here so we can jmp reg. We go ahead and the spill here
     * and jmp through reg for 32-bit as well for consistency. */
    APP(&ilist, XINST_CREATE_store(dcontext, opnd_create_tls_slot(PID_TIB_OFFSET),
                                   opnd_create_reg(SCRATCH_REG0)));
    APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(SCRATCH_REG0),
                                     OPND_CREATE_INTPTR((ptr_uint_t)target)));
    APP(&ilist, INSTR_CREATE_jmp_ind(dcontext, opnd_create_reg(SCRATCH_REG0)));

    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, true /* instr targets */);
    ASSERT(pc != NULL);
    /* ASSERT that there was enough space after the system call (everything after
     * do_syscall should be dead at this point). */
    ASSERT(pc <= get_emitted_routines_code(dcontext _IF_X64(mode))->commit_end_pc);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);
}
#endif /* WINDOWS */

/* this routine performs a single system call instruction and then returns
 * to dynamo via fcache_return
 */
static byte *
emit_do_syscall_common(dcontext_t *dcontext, generated_code_t *code,
                       byte *pc, byte *fcache_return_pc,
                       bool handle_clone, bool thread_shared, int interrupt,
                       instr_t *syscall_instr, uint *syscall_offs /*OUT*/)
{
    instrlist_t ilist;
    instr_t *syscall;
#ifdef UNIX
    instr_t *post_syscall;
#endif

#if defined(UNIX) && !defined(X64)
    /* PR 286922: 32-bit clone syscall cannot use vsyscall: must be int */
    if (handle_clone) {
        ASSERT(interrupt == 0 || interrupt == 0x80);
        interrupt = 0x80;
    }
#endif
    if (syscall_instr != NULL)
        syscall = syscall_instr;
    else {
        if (interrupt != 0) {
#ifdef X86
            syscall = INSTR_CREATE_int(dcontext,
                                       opnd_create_immed_int((char)interrupt, OPSZ_1));
#elif defined(ARM)
            /* FIXMED i#1551: NYI on ARM */
            ASSERT_NOT_IMPLEMENTED(false);
#endif
        } else
            syscall = create_syscall_instr(dcontext);
    }

    /* i#821/PR 284029: for now we assume there are no syscalls in x86 code.
     */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!GENCODE_IS_X86(code->gencode_mode)));

    ASSERT(syscall_offs != NULL);
    *syscall_offs = instr_length(dcontext, syscall);

    /* initialize the ilist */
    instrlist_init(&ilist);

    /* system call itself -- using same method we've observed OS using */
    APP(&ilist, syscall);
#ifdef UNIX
# ifdef X86
    if (get_syscall_method() == SYSCALL_METHOD_UNINITIALIZED) {
        /* Since we lazily find out the method, but emit these routines
         * up front, we have to leave room for the longest syscall method.
         * This used to the 6-byte LOL64 call* but we now walk into that
         * call* (PR 286922).  Not much of a perf worry, but if we
         * ever have proactive syscall determination on linux we should
         * remove these nops.
         */
        ASSERT(instr_length(dcontext, instrlist_last(&ilist)) == 2);
        if (SYSCALL_METHOD_LONGEST_INSTR == 6) {
            /* we could add 4-byte nop support but I'm too lazy */
            APP(&ilist, INSTR_CREATE_nop3byte(dcontext));
            APP(&ilist, INSTR_CREATE_nop1byte(dcontext));
        } else
            ASSERT_NOT_IMPLEMENTED(instr_length(dcontext, instrlist_last(&ilist)) ==
                                   SYSCALL_METHOD_LONGEST_INSTR);
    }
# elif defined(ARM)
    /* FIXMED i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
# endif
    post_syscall = instrlist_last(&ilist);
#endif

    /* go to fcache return -- use special syscall linkstub */
    /* in case it returns: go to fcache return -- use 0 as &linkstub */
    if (thread_shared)
        APP(&ilist, instr_create_save_to_tls(dcontext, SCRATCH_REG0, TLS_SLOT_REG0));
    else
        APP(&ilist, instr_create_save_to_dcontext(dcontext, SCRATCH_REG0,
                                                  SCRATCH_REG0_OFFS));
    APP(&ilist, XINST_CREATE_load_int
        (dcontext, opnd_create_reg(SCRATCH_REG0),
         OPND_CREATE_INTPTR((ptr_int_t)get_syscall_linkstub())));
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(fcache_return_pc)));

#ifdef UNIX
    if (handle_clone) {
        /* put in clone code, and make sure to target it.
         * do it here since it assumes an instr after the syscall exists.
         */
        mangle_insert_clone_code(dcontext, &ilist, post_syscall,
                                 false /*do not skip*/
                                 _IF_X64(code->gencode_mode));
    }
#endif

    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc,
#ifdef UNIX
                          handle_clone /* instr targets */
#else
                          false /* no instr targets */
#endif
                          );
    ASSERT(pc != NULL);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}

#ifdef WINDOWS
/* like fcache_enter but indirects the dcontext passed in through edi */
byte *
emit_fcache_enter_indirect(dcontext_t *dcontext, generated_code_t *code,
                           byte *pc, byte *fcache_return_pc)
{
    return emit_fcache_enter_common(dcontext, code, pc,
                                    false/*indirect*/, false/*!shared*/);
}

/* This routine performs an int 2b, which maps to NtCallbackReturn, and then returns
 * to dynamo via fcache_return (though it won't reach there)
 */
byte *
emit_do_callback_return(dcontext_t *dcontext, byte *pc, byte *fcache_return_pc,
                        bool thread_shared)
{
    instrlist_t ilist;

    /* initialize the ilist */
    instrlist_init(&ilist);

    /* interrupt 2b */
    APP(&ilist, INSTR_CREATE_int(dcontext, opnd_create_immed_int(0x2b, OPSZ_1)));

    /* in case it returns: go to fcache return -- use 0 as &linkstub */
    if (thread_shared)
        APP(&ilist, instr_create_save_to_tls(dcontext, SCRATCH_REG0, TLS_SLOT_REG0));
    else
        APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS));
    /* for x64 we rely on sign-extension to fill out rax */
    APP(&ilist, INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_EAX),
                                     OPND_CREATE_INT32(0)));
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(fcache_return_pc)));

    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, false /* no instr targets */);
    ASSERT(pc != NULL);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}
#else /* !WINDOWS => UNIX */
byte *
emit_do_clone_syscall(dcontext_t *dcontext, generated_code_t *code, byte *pc,
                      byte *fcache_return_pc, bool thread_shared,
                      uint *syscall_offs /*OUT*/)
{
    return emit_do_syscall_common(dcontext, code, pc, fcache_return_pc,
                                  true, thread_shared, false, NULL, syscall_offs);
}
# ifdef VMX86_SERVER
byte *
emit_do_vmkuw_syscall(dcontext_t *dcontext, generated_code_t *code, byte *pc,
                      byte *fcache_return_pc, bool thread_shared,
                      uint *syscall_offs /*OUT*/)
{
    instr_t *gateway = INSTR_CREATE_int
        (dcontext, opnd_create_immed_int((char)VMKUW_SYSCALL_GATEWAY, OPSZ_1));
    return emit_do_syscall_common(dcontext, code, pc, fcache_return_pc,
                                  false, thread_shared, false, gateway, syscall_offs);
}
# endif
#endif /* UNIX */

byte *
emit_do_syscall(dcontext_t *dcontext, generated_code_t *code, byte *pc,
                byte *fcache_return_pc, bool thread_shared, int interrupt,
                uint *syscall_offs /*OUT*/)
{
    pc = emit_do_syscall_common(dcontext, code, pc, fcache_return_pc,
                                false, thread_shared, interrupt, NULL, syscall_offs);
    return pc;
}

#ifndef WINDOWS
/* updates first syscall instr it finds with the new method of syscall */
static void
update_syscall(dcontext_t *dcontext, byte *pc)
{
    LOG_DECLARE(byte *start_pc = pc;)
    byte *prev_pc;
    instr_t instr;
    instr_init(dcontext, &instr);

    do {
        prev_pc = pc;
        instr_reset(dcontext, &instr);
        pc = decode_cti(dcontext, pc, &instr);
        ASSERT(pc != NULL); /* this our own code we're decoding, should be valid */
        if (instr_is_syscall(&instr)) {
            instr_t *newinst = create_syscall_instr(dcontext);
            byte *nxt_pc = instr_encode(dcontext, newinst, prev_pc);
            /* instruction must not change size! */
            ASSERT(nxt_pc != NULL);
            if (nxt_pc != pc) {
                pc = nxt_pc;
                byte *stop_pc = prev_pc + SYSCALL_METHOD_LONGEST_INSTR;
                ASSERT(nxt_pc <= stop_pc);
                while (pc < stop_pc) {
                    /* we could add >3-byte nop support but I'm too lazy */
                    int noplen = MIN(stop_pc - pc, 3);
                    instr_t *nop = instr_create_nbyte_nop(dcontext, noplen, true);
                    pc = instr_encode(dcontext, nop, pc);
                    ASSERT(pc != NULL);
                    instr_destroy(dcontext, nop);
                }
            }
            instr_destroy(dcontext, newinst);
            break;
        }
        ASSERT(pc - prev_pc < 128);
    } while (1);

    instr_free(dcontext, &instr);

    DOLOG(3, LOG_EMIT, {
        LOG(THREAD, LOG_EMIT, 3, "Just updated syscall routine:\n");
        prev_pc = pc;
        pc = start_pc;
        do {
            pc = disassemble_with_bytes(dcontext, pc, THREAD);
        } while (pc < prev_pc + 1); /* +1 to get next instr */
        LOG(THREAD, LOG_EMIT, 3, "  ...\n");
    });
}

void
update_syscalls(dcontext_t *dcontext)
{
    byte *pc;
    pc = get_do_syscall_entry(dcontext);
    update_syscall(dcontext, pc);
#ifdef X64
    /* PR 286922: for 32-bit, we do NOT update the clone syscall as it
     * always uses int (since can't use call to vsyscall when swapping
     * stacks!)
     */
    pc = get_do_clone_syscall_entry(dcontext);
    update_syscall(dcontext, pc);
#endif
}
#endif /* !WINDOWS */

/* Returns -1 on failure */
int
decode_syscall_num(dcontext_t *dcontext, byte *entry)
{
    byte *pc;
    int syscall = -1;
    instr_t instr;
    ASSERT(entry != NULL);
    instr_init(dcontext, &instr);
    pc = entry;
    LOG(GLOBAL, LOG_EMIT, 3, "decode_syscall_num "PFX"\n", entry);
    while (true) {
        DOLOG(3, LOG_EMIT, { disassemble_with_bytes(dcontext, pc, GLOBAL); });
        instr_reset(dcontext, &instr);
        pc = decode(dcontext, pc, &instr);
        if (pc == NULL)
            break; /* give up gracefully */
        /* we do not handle control transfer instructions! */
        if (instr_is_cti(&instr)) {
#ifdef WINDOWS /* since no interception code buffer to check on linux */
            if (DYNAMO_OPTION(native_exec_syscalls) && instr_is_ubr(&instr)) {
                /* probably our own trampoline, follow it
                 * ASSUMPTION: mov eax is the instr that jmp targets: i.e.,
                 * we don't handle deep hooks here.
                 */
                if (!is_syscall_trampoline(opnd_get_pc(instr_get_target(&instr)), &pc)) {
                    break;  /* give up gracefully */
                } /* else, carry on at pc */
            } else
#endif
                break; /* give up gracefully */
        }
        if (instr_num_dsts(&instr) > 0 &&
            opnd_is_reg(instr_get_dst(&instr, 0)) &&
            opnd_get_reg(instr_get_dst(&instr, 0)) == SCRATCH_REG0) {
            if (instr_get_opcode(&instr) == IF_X86_ELSE(OP_mov_imm, OP_mov)) {
                IF_X64(ASSERT_TRUNCATE(int, int,
                                       opnd_get_immed_int(instr_get_src(&instr, 0))));
                syscall = (int) opnd_get_immed_int(instr_get_src(&instr, 0));
                LOG(GLOBAL, LOG_EMIT, 3, "\tfound syscall num: 0x%x\n", syscall);
                break;
            } else
                break; /* give up gracefully */
        }
    }
    instr_free(dcontext, &instr);
    return syscall;
}

#ifdef UNIX
/* PR 212290: can't be static code in x86.asm since it can't be PIC */
/*
 * new_thread_dynamo_start - for initializing a new thread created
 * via the clone system call.
 * assumptions:
 *   1) app's xcx is in xax.
 *   2) xcx contains clone_record_t, which is not 0.
 *   3) app's xax should contain 0.
 *   4) this thread holds initstack_mutex
 */
byte *
emit_new_thread_dynamo_start(dcontext_t *dcontext, byte *pc)
{
    instrlist_t ilist;
#ifdef X86
    uint offset;
#endif

    /* initialize the ilist */
    instrlist_init(&ilist);

    /* Since we don't have TLS available here (we could use CLONE_SETTLS
     * for kernel 2.5.32+: PR 285898) we can't non-racily acquire
     * initstack_mutex as we can't spill or spare a register
     * (xref i#101/PR 207903).
     */

    /* Restore app xcx and xax, which were swapped post-syscall to distinguish
     * parent from child.
     */
#ifdef X86
    APP(&ilist, INSTR_CREATE_xchg
        (dcontext, opnd_create_reg(SCRATCH_REG0), opnd_create_reg(SCRATCH_REG2)));

    /* grab exec state and pass as param in a priv_mcontext_t struct
     * new_thread_setup will restore real app xsp and xax
     * we emulate x86.asm's PUSH_DR_MCONTEXT(SCRATCH_REG0) (for priv_mcontext_t.pc)
     */
    offset = insert_push_all_registers(dcontext, NULL, &ilist, NULL,
                                       IF_X64_ELSE(16, 4),
                                       INSTR_CREATE_push(dcontext,
                                                         opnd_create_reg(SCRATCH_REG0)));
    /* put pre-push xsp into priv_mcontext_t.xsp slot */
    ASSERT(offset == sizeof(priv_mcontext_t));
    APP(&ilist, INSTR_CREATE_lea
        (dcontext, opnd_create_reg(SCRATCH_REG0),
         OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0, sizeof(priv_mcontext_t))));
    APP(&ilist, XINST_CREATE_store
        (dcontext, OPND_CREATE_MEMPTR(REG_XSP, offsetof(priv_mcontext_t, xsp)),
         opnd_create_reg(SCRATCH_REG0)));

    /* We avoid get_thread_id syscall in get_thread_private_dcontext()
     * by clearing the segment register here (cheaper check than syscall)
     * (xref PR 192231).  If we crash prior to this point though, the
     * signal handler will get the wrong dcontext, but that's a small window.
     * See comments in get_thread_private_dcontext() for alternatives.
     */
    APP(&ilist, XINST_CREATE_load_int
        (dcontext, opnd_create_reg(REG_AX), OPND_CREATE_INT16(0)));
    APP(&ilist, INSTR_CREATE_mov_seg
        (dcontext, opnd_create_reg(SEG_TLS), opnd_create_reg(REG_AX)));
    /* stack grew down, so priv_mcontext_t at tos */
    APP(&ilist, INSTR_CREATE_lea
        (dcontext, opnd_create_reg(SCRATCH_REG0),
         OPND_CREATE_MEM_lea(REG_XSP, REG_NULL, 0, 0)));
#elif defined(ARM)
    /* FIXMED i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
#endif

    dr_insert_call(dcontext, &ilist, NULL, (void *)new_thread_setup,
                   1, opnd_create_reg(SCRATCH_REG0));

    /* should not return */
    insert_reachable_cti(dcontext, &ilist, NULL, vmcode_get_start(),
                         (byte *)unexpected_return, true/*jmp*/, false/*!precise*/,
                         DR_REG_R11/*scratch*/, NULL);

    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, true /* instr targets */);
    ASSERT(pc != NULL);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}
#endif /* UNIX */

#ifdef TRACE_HEAD_CACHE_INCR
/* trace_t heads come here instead of back to dynamo to have their counters
 * incremented.
 */
byte *
emit_trace_head_incr(dcontext_t *dcontext, byte *pc, byte *fcache_return_pc)
{
    /*  save ecx
        save eax->xbx slot
        mov target_fragment_offs(eax), eax
        movzx counter_offs(eax), ecx
        lea 1(ecx), ecx                 # increment counter
        mov data16 cx, counter_offs(eax)
        lea -hot_threshold(ecx), ecx    # compare to hot_threshold
        jecxz is_hot
        mov start_pc_offs(eax), ecx
        movzx prefix_size_offs(eax), eax
        lea (ecx,eax,1), ecx
        mov ecx, trace_head_pc_offs + dcontext   # special slot to avoid target prefix
        restore ecx
        restore eax
        jmp * trace_head_pc_offs + dcontext
      is_hot:
        restore ebx slot to eax         # put &l into eax
        restore ecx
        jmp fcache_return
     */
    instrlist_t ilist;
    instr_t *is_hot =
        instr_create_restore_from_dcontext(dcontext, REG_EAX, SCRATCH_REG1_OFFS);
    instr_t *in;

    /* PR 248210: unsupported feature on x64 */
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));

    instrlist_init(&ilist);
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS));
    if (DYNAMO_OPTION(shared_bbs)) {
        /* HACK to get shared exit stub, which puts eax into fs:scratch1, to work
         * w/ thread-private THCI: we pull eax out of the tls slot and into mcontext.
         * This requires that all direct stubs for cti that can link to trace
         * heads use the shared stub -- so if traces can link to trace heads, their
         * exits must use the shared stubs, even if the traces are thread-private.
         */
        APP(&ilist, RESTORE_FROM_TLS(dcontext, REG_ECX, EXIT_STUB_SPILL_SLOT));
        APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ECX, SCRATCH_REG0_OFFS));
    }
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_EAX, SCRATCH_REG1_OFFS));
    APP(&ilist, XINST_CREATE_load(dcontext, opnd_create_reg(REG_EAX),
                                  OPND_CREATE_MEM32(REG_EAX, LINKSTUB_TARGET_FRAG_OFFS)));
    ASSERT_NOT_IMPLEMENTED(false &&
                           "must handle LINKSTUB_CBR_FALLTHROUGH case"
                           " by calculating target tag")
    APP(&ilist, INSTR_CREATE_movzx(dcontext, opnd_create_reg(REG_ECX),
                                   opnd_create_base_disp(REG_EAX, REG_NULL, 0,
                                                         FRAGMENT_COUNTER_OFFS, OPSZ_2)));
    APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_ECX),
                              opnd_create_base_disp(REG_ECX, REG_NULL, 0, 1, OPSZ_lea)));
    /* data16 prefix is set auto-magically */
    APP(&ilist, XINST_CREATE_store(dcontext,
                                   opnd_create_base_disp(REG_EAX, REG_NULL, 0,
                                                         FRAGMENT_COUNTER_OFFS, OPSZ_2),
                                   opnd_create_reg(REG_CX)));
    APP(&ilist, INSTR_CREATE_lea
        (dcontext, opnd_create_reg(REG_ECX),
         opnd_create_base_disp(REG_ECX, REG_NULL, 0,
                               -((int)INTERNAL_OPTION(trace_threshold)), OPSZ_lea)));
    APP(&ilist, INSTR_CREATE_jecxz(dcontext, opnd_create_instr(is_hot)));
    APP(&ilist, XINST_CREATE_load(dcontext, opnd_create_reg(REG_ECX),
                                  OPND_CREATE_MEM32(REG_EAX, FRAGMENT_START_PC_OFFS)));
    APP(&ilist, INSTR_CREATE_movzx(dcontext, opnd_create_reg(REG_EAX),
                                   opnd_create_base_disp(REG_EAX, REG_NULL, 0,
                                                         FRAGMENT_PREFIX_SIZE_OFFS,
                                                         OPSZ_1)));
    APP(&ilist, INSTR_CREATE_lea(dcontext, opnd_create_reg(REG_ECX),
                                 opnd_create_base_disp(REG_ECX, REG_EAX, 1, 0, OPSZ_lea)));
    APP(&ilist, instr_create_save_to_dcontext(dcontext, REG_ECX, TRACE_HEAD_PC_OFFSET));
    APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS));
    APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_EAX, SCRATCH_REG0_OFFS));
    APP(&ilist, INSTR_CREATE_jmp_ind(dcontext,
                          opnd_create_dcontext_field(dcontext, TRACE_HEAD_PC_OFFSET)));
    APP(&ilist, is_hot);
    APP(&ilist, instr_create_restore_from_dcontext(dcontext, REG_ECX, SCRATCH_REG2_OFFS));
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(fcache_return_pc)));

    /* now encode the instructions */
    pc = instrlist_encode(dcontext, &ilist, pc, true /* instr targets */);
    ASSERT(pc != NULL);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}

byte *
emit_trace_head_incr_shared(dcontext_t *dcontext, byte *pc, byte *fcache_return_pc)
{
    ASSERT_NOT_IMPLEMENTED(false);
}

#endif /* TRACE_HEAD_CACHE_INCR */

/***************************************************************************
 * SPECIAL IBL XFER ROUTINES
 */

static inline byte *
special_ibl_xfer_tgt(dcontext_t *dcontext, generated_code_t *code,
                     ibl_entry_point_type_t entry_type,
                     ibl_branch_type_t ibl_type)
{
    /* We use the trace ibl so that the target will be a trace head,
     * avoiding a trace disruption.
     * We request that bbs doing this xfer are marked DR_EMIT_MUST_END_TRACE.
     * We use the ret ibt b/c we figure most uses will involve rets and there's
     * no reason to fill up the jmp ibt.
     * This feature is unavail for prog shep b/c of the cross-type pollution.
     */
    return get_ibl_routine_ex(dcontext, entry_type,
                              DYNAMO_OPTION(disable_traces) ?
                              (code->thread_shared ? IBL_BB_SHARED : IBL_BB_PRIVATE) :
                              (code->thread_shared ? IBL_TRACE_SHARED : IBL_TRACE_PRIVATE),
                              ibl_type
                              _IF_X64(code->gencode_mode));
}

/* We only need a thread-private version if our ibl target is thread-private */
bool
special_ibl_xfer_is_thread_private(void)
{
#ifdef X64
    return false; /* all gencode is shared */
#else
    return (DYNAMO_OPTION(disable_traces) ?
            !DYNAMO_OPTION(shared_bbs) : !DYNAMO_OPTION(shared_traces));
#endif
}

/* emit the special_ibl trampoline code for transferring the control flow to
 * ibl lookup
 * - index: the index of special_ibl array to be emitted to
 * - ibl_type: the branch type (IBL_RETURN or IBL_INDCALL)
 * - custom_ilist: the custom instructions added by caller, which are added at
 *   the end of trampoline and right before jump to the ibl routine
 * - tgt: the opnd holding the target, which will be moved into XCX for ibl.
 */
static byte *
emit_special_ibl_xfer(dcontext_t *dcontext, byte *pc, generated_code_t *code,
                      uint index,
                      ibl_branch_type_t ibl_type,
                      instrlist_t *custom_ilist, opnd_t tgt)
{
    instrlist_t ilist;
    patch_list_t patch;
    instr_t *in;
    size_t len;
    byte *ibl_tgt = special_ibl_xfer_tgt(dcontext, code, IBL_LINKED, ibl_type);
    bool absolute = !code->thread_shared;

    ASSERT(ibl_tgt != NULL);
    instrlist_init(&ilist);
    init_patch_list(&patch, absolute ? PATCH_TYPE_ABSOLUTE : PATCH_TYPE_INDIRECT_FS);

    if (DYNAMO_OPTION(indirect_stubs)) {
        const linkstub_t *linkstub =
            get_special_ibl_linkstub(ibl_type,
                                     DYNAMO_OPTION(disable_traces) ? false : true);
        APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG1, TLS_SLOT_REG1));
        APP(&ilist, XINST_CREATE_load_int(dcontext, opnd_create_reg(SCRATCH_REG1),
                                          OPND_CREATE_INTPTR((ptr_int_t)linkstub)));
    }

    if (code->thread_shared || DYNAMO_OPTION(private_ib_in_tls)) {
#ifdef X64
        if (GENCODE_IS_X86_TO_X64(code->gencode_mode) &&
            DYNAMO_OPTION(x86_to_x64_ibl_opt)) {
            APP(&ilist, SAVE_TO_REG(dcontext, SCRATCH_REG2, REG_R9));
        } else
#endif
            APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2, MANGLE_XCX_SPILL_SLOT));
    } else {
        APP(&ilist, SAVE_TO_DC(dcontext, SCRATCH_REG2, SCRATCH_REG2_OFFS));
    }

    APP(&ilist,
        XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG2), tgt));

    /* insert customized instructions right before xfer to ibl */
    if (custom_ilist != NULL)
        in = instrlist_first(custom_ilist);
    else
        in = NULL;
    while (in != NULL) {
        instrlist_remove(custom_ilist, in);
        APP(&ilist, in);
        in = instrlist_first(custom_ilist);
    }

#ifdef X64
    if (GENCODE_IS_X86(code->gencode_mode))
        instrlist_convert_to_x86(&ilist);
#endif
    /* do not add new instrs that need conversion to x86 below here! */

    /* to support patching the 4-byte pc-rel tgt we must ensure it doesn't
     * cross a cache line
     */
    for (len = 0, in = instrlist_first(&ilist); in != NULL; in = instr_get_next(in)) {
        len += instr_length(dcontext, in);
    }
    if (CROSSES_ALIGNMENT(pc + len + 1/*opcode*/, 4, PAD_JMPS_ALIGNMENT)) {
        instr_t *nop_inst;
        len = ALIGN_FORWARD(pc + len + 1, 4) - (ptr_uint_t)(pc + len + 1);
#ifdef X86
        nop_inst = INSTR_CREATE_nopNbyte(dcontext, (uint)len);
#elif defined(ARM)
        nop_inst = NULL;
        /* FIXMED i#1551: NYI on ARM */
        ASSERT_NOT_IMPLEMENTED(false);
#endif
#ifdef X64
        if (GENCODE_IS_X86(code->gencode_mode)) {
            instr_set_x86_mode(nop_inst, true/*x86*/);
            instr_shrink_to_32_bits(nop_inst);
        }
#endif
        /* XXX: better to put prior to entry point but then need to change model
         * of who assigns entry point
         */
        APP(&ilist, nop_inst);
    }
    APP(&ilist, XINST_CREATE_jump(dcontext, opnd_create_pc(ibl_tgt)));
    add_patch_marker(&patch, instrlist_last(&ilist),
                     PATCH_UINT_SIZED /* pc relative */,
                     0 /* point at opcode of jecxz */,
                     (ptr_uint_t*)&code->special_ibl_unlink_offs[index]);

    /* now encode the instructions */
    pc += encode_with_patch_list(dcontext, &patch, &ilist, pc);
    ASSERT(pc != NULL);

    /* free the instrlist_t elements */
    instrlist_clear(dcontext, &ilist);

    return pc;
}

static void
relink_special_ibl_xfer(dcontext_t *dcontext, int index,
                        ibl_entry_point_type_t entry_type,
                        ibl_branch_type_t ibl_type)
{
    generated_code_t *code;
    byte *pc, *ibl_tgt;
    if (dcontext == GLOBAL_DCONTEXT) {
        ASSERT(!special_ibl_xfer_is_thread_private()); /* else shouldn't be called */
        code = SHARED_GENCODE_MATCH_THREAD(get_thread_private_dcontext());
    } else {
#ifdef X64
        code = SHARED_GENCODE_MATCH_THREAD(dcontext);
#else
        ASSERT(special_ibl_xfer_is_thread_private()); /* else shouldn't be called */
        code = THREAD_GENCODE(dcontext);
#endif
    }
    if (code == NULL) /* shared_code_x86, or thread private that we don't need */
        return;
    ibl_tgt = special_ibl_xfer_tgt(dcontext, code, entry_type, ibl_type);
    ASSERT(code->special_ibl_xfer[index] != NULL);
    pc = (code->special_ibl_xfer[index] +
          code->special_ibl_unlink_offs[index] + 1/*jmp opcode*/);

    protect_generated_code(code, WRITABLE);
    insert_relative_target(pc, ibl_tgt, code->thread_shared/*hot patch*/);
    protect_generated_code(code, READONLY);
}

void
link_special_ibl_xfer(dcontext_t *dcontext)
{
    IF_CLIENT_INTERFACE(relink_special_ibl_xfer(dcontext, CLIENT_IBL_IDX,
                                                IBL_LINKED, IBL_RETURN);)
#ifdef UNIX
    if (DYNAMO_OPTION(native_exec_opt)) {
        relink_special_ibl_xfer(dcontext, NATIVE_PLT_IBL_IDX,
                                IBL_LINKED, IBL_INDCALL);
        relink_special_ibl_xfer(dcontext, NATIVE_RET_IBL_IDX,
                                IBL_LINKED, IBL_RETURN);
    }
#endif
}

void
unlink_special_ibl_xfer(dcontext_t *dcontext)
{
    IF_CLIENT_INTERFACE(relink_special_ibl_xfer(dcontext, CLIENT_IBL_IDX,
                                                IBL_UNLINKED, IBL_RETURN);)
#ifdef UNIX
    if (DYNAMO_OPTION(native_exec_opt)) {
        relink_special_ibl_xfer(dcontext, NATIVE_PLT_IBL_IDX,
                                IBL_UNLINKED, IBL_INDCALL);
        relink_special_ibl_xfer(dcontext, NATIVE_RET_IBL_IDX,
                                IBL_UNLINKED, IBL_RETURN);
    }
#endif
}


#ifdef CLIENT_INTERFACE
/* i#849: low-overhead xfer for clients */
byte *
emit_client_ibl_xfer(dcontext_t *dcontext, byte *pc, generated_code_t *code)
{
    /* The client puts the target in SPILL_SLOT_REDIRECT_NATIVE_TGT. */
    return emit_special_ibl_xfer(dcontext, pc, code, CLIENT_IBL_IDX,
                                 IBL_RETURN, NULL,
                                 reg_spill_slot_opnd
                                 (dcontext, SPILL_SLOT_REDIRECT_NATIVE_TGT));
}

#endif /* CLIENT_INTERFACE */

/* i#171: out-of-line clean call */
/* XXX: i#1149 the clean call context switch should be shared among all threads */
bool
client_clean_call_is_thread_private(void)
{
#ifdef X64
    return false; /* all gencode is shared */
#else
    return !USE_SHARED_GENCODE();
#endif
}

byte *
emit_clean_call_save(dcontext_t *dcontext, byte *pc, generated_code_t *code)
{
    instrlist_t ilist;

    instrlist_init(&ilist);
    /* xref insert_out_of_line_context_switch @ x86/mangle.c,
     * stack was adjusted beyond what we place there to get retaddr
     * in right spot, adjust the stack back to save context
     */
    /* XXX: this LEA can be optimized away by using the LEA
     * in insert_push_all_registers
     */
#ifdef X86
    APP(&ilist, INSTR_CREATE_lea
        (dcontext,
         opnd_create_reg(DR_REG_XSP),
         opnd_create_base_disp(DR_REG_XSP, DR_REG_NULL, 0,
                               (int)(get_clean_call_switch_stack_size() +
                                     get_clean_call_temp_stack_size() +
                                     XSP_SZ /* return addr */),
                               OPSZ_lea)));

    /* save all registers */
    insert_push_all_registers(dcontext, NULL, &ilist, NULL, PAGE_SIZE,
                              INSTR_CREATE_push_imm(dcontext,
                                                    OPND_CREATE_INT32(0)));
#elif defined(ARM)
    /* FIXMED i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
#endif

#if defined(WINDOWS) && defined(CLIENT_INTERFACE)
    /* i#249: isolate the PEB */
    if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) {
        /* We pay the cost of this extra load of dcontext in order to get
         * this code shared (when not shared we place this where we already
         * have the dcontext in a register: see prepare_for_clean_call()).
         */
        if (SCRATCH_ALWAYS_TLS())
            insert_get_mcontext_base(dcontext, &ilist, NULL, SCRATCH_REG0);
        preinsert_swap_peb(dcontext, &ilist, NULL, !SCRATCH_ALWAYS_TLS(),
                           SCRATCH_REG0/*dc*/, SCRATCH_REG2/*scratch*/, true/*to priv*/);
        /* We also need 2 extra loads to restore the 2 regs, in case the
         * clean call passes them as args.
         */
        APP(&ilist, XINST_CREATE_load
            (dcontext, opnd_create_reg(SCRATCH_REG0),
             OPND_CREATE_MEMPTR(REG_XSP, offsetof(priv_mcontext_t, xax))));
        APP(&ilist, XINST_CREATE_load
            (dcontext, opnd_create_reg(SCRATCH_REG2),
             OPND_CREATE_MEMPTR(REG_XSP, offsetof(priv_mcontext_t, xcx))));
    }
#endif

    /* clear eflags */
    insert_clear_eflags(dcontext, NULL, &ilist, NULL);
#ifdef X86
    /* return back */
    APP(&ilist, INSTR_CREATE_lea
        (dcontext, opnd_create_reg(DR_REG_XSP),
         opnd_create_base_disp(DR_REG_XSP, DR_REG_NULL, 0,
                               -(get_clean_call_temp_stack_size() +
                                 (int)XSP_SZ /* return stack */),
                               OPSZ_lea)));
    APP(&ilist, INSTR_CREATE_ret_imm
        (dcontext, OPND_CREATE_INT16(get_clean_call_temp_stack_size())));
#elif defined(ARM)
    /* FIXMED i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
#endif

    /* emti code */
    pc = instrlist_encode(dcontext, &ilist, pc, false);
    ASSERT(pc != NULL);
    instrlist_clear(dcontext, &ilist);
    return pc;
}

byte *
emit_clean_call_restore(dcontext_t *dcontext, byte *pc, generated_code_t *code)
{
    instrlist_t ilist;

    instrlist_init(&ilist);

#if defined(WINDOWS) && defined(CLIENT_INTERFACE)
    /* i#249: isolate the PEB */
    if (INTERNAL_OPTION(private_peb) && should_swap_peb_pointer()) {
        /* We pay the cost of this extra load of dcontext in order to get
         * this code shared (when not shared we place this where we already
         * have the dcontext in a register: see cleanup_after_clean_call()).
         * The 2 regs are dead as the popa will restore.
         */
        if (SCRATCH_ALWAYS_TLS())
            insert_get_mcontext_base(dcontext, &ilist, NULL, SCRATCH_REG0);
        preinsert_swap_peb(dcontext, &ilist, NULL, !SCRATCH_ALWAYS_TLS(),
                           SCRATCH_REG0/*dc*/, SCRATCH_REG2/*scratch*/, false/*to app*/);
    }
#endif

#ifdef X86
    /* adjust the stack for the return target */
    APP(&ilist, INSTR_CREATE_lea
        (dcontext,
         opnd_create_reg(DR_REG_XSP),
         opnd_create_base_disp(DR_REG_XSP, DR_REG_NULL, 0,
                               (int)XSP_SZ, OPSZ_lea)));
    /* restore all registers */
    insert_pop_all_registers(dcontext, NULL, &ilist, NULL, PAGE_SIZE);
    /* return back */
    /* we adjust lea + ret_imm instead of ind jmp to take advantage of RSB */
    APP(&ilist, INSTR_CREATE_lea
        (dcontext,
         opnd_create_reg(DR_REG_XSP),
         opnd_create_base_disp(DR_REG_XSP, DR_REG_NULL, 0,
                               -(get_clean_call_switch_stack_size() +
                                 (int)XSP_SZ /* return address */),
                               OPSZ_lea)));
    APP(&ilist, INSTR_CREATE_ret_imm
        (dcontext,
         OPND_CREATE_INT16(get_clean_call_switch_stack_size())));
#elif defined(ARM)
    /* FIXMED i#1551: NYI on ARM */
    ASSERT_NOT_IMPLEMENTED(false);
#endif
    /* emit code */
    pc = instrlist_encode(dcontext, &ilist, pc, false);
    ASSERT(pc != NULL);
    instrlist_clear(dcontext, &ilist);
    return pc;
}

/* mirrored inline implementation of set_last_exit() */
void
insert_set_last_exit(dcontext_t *dcontext, linkstub_t *l,
                     instrlist_t *ilist, instr_t *where, reg_id_t reg_dc)
{
    ASSERT(l != NULL);

    /* C equivalent:
     *   dcontext->last_exit = l
     */
    insert_mov_immed_ptrsz
        (dcontext, (ptr_int_t) l,
         opnd_create_dcontext_field_via_reg(dcontext, reg_dc, LAST_EXIT_OFFSET),
         ilist, where, NULL, NULL);

    /* C equivalent:
     *   dcontext->last_fragment = linkstub_fragment()
     */
    insert_mov_immed_ptrsz
        (dcontext, (ptr_int_t) linkstub_fragment(dcontext, l),
         opnd_create_dcontext_field_via_reg(dcontext, reg_dc, LAST_FRAG_OFFSET),
         ilist, where, NULL, NULL);

    /* C equivalent:
     *   dcontext->coarse_exit.dir_exit = NULL
     */
    insert_mov_immed_ptrsz
        (dcontext, (ptr_int_t) NULL,
         opnd_create_dcontext_field_via_reg(dcontext, reg_dc,
                                            COARSE_DIR_EXIT_OFFSET),
         ilist, where, NULL, NULL);
}

/* mirrored inline implementation of return_to_native() */
static void
insert_entering_native(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
                       reg_id_t reg_dc, reg_id_t reg_scratch)
{
#ifdef WINDOWS
    /* FIXME i#1238-c#1: we did not turn off asynch interception in windows */
    /* skip C equivalent:
     * set_asynch_interception(dcontext->owning_thread, false)
     */
    ASSERT_BUG_NUM(1238, false && "set_asynch_interception is not inlined");
#endif

    /* C equivalent:
     *   dcontext->thread_record->under_dynamo_control = false
     */
    PRE(ilist, where,
        instr_create_restore_from_dc_via_reg(dcontext, reg_dc, reg_scratch,
                                             THREAD_RECORD_OFFSET));
    PRE(ilist, where,
        XINST_CREATE_store(dcontext,
                           OPND_CREATE_MEM8(reg_scratch,
                                            offsetof(thread_record_t,
                                                     under_dynamo_control)),
                           OPND_CREATE_INT8(false)));

    /* C equivalent:
     *   set_last_exit(dcontext, (linkstub_t *) get_native_exec_linkstub())
     */
    insert_set_last_exit(dcontext,
                         (linkstub_t *) get_native_exec_linkstub(),
                         ilist, where, reg_dc);

    /* XXX i#1238-c#4 -native_exec_opt does not support -kstats
     * skip C equivalent:
     *   KSTOP_NOT_MATCHING(dispatch_num_exits)
     */

    /* skip C equivalent:
     *   SYSLOG_INTERNAL_WARNING_ONCE("entered at least one module natively")
     */

    /* C equivalent:
     *   whereami = WHERE_APP
     */
    PRE(ilist, where,
        instr_create_save_immed_to_dc_via_reg(dcontext, reg_dc, WHEREAMI_OFFSET,
                                              (ptr_int_t) WHERE_APP, OPSZ_4));

    /* skip C equivalent:
     *   STATS_INC(num_native_module_enter)
     */
}

/* mirrored inline implementation of return_to_native()
 * two registers are needed:
 * - reg_dc holds the dcontext
 * - reg_scratch is the scratch register.
 */
void
insert_return_to_native(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
                        reg_id_t reg_dc, reg_id_t reg_scratch)
{
    /* skip C equivalent:
     * ENTERING_DR()
     */
    ASSERT(dcontext != NULL);

    /* C equivalent:
     * entering_native(dcontext)
     */
    insert_entering_native(dcontext, ilist, where, reg_dc, reg_scratch);

    /* skip C equivalent:
     * EXITING_DR()
     */
}

#if defined(UNIX)
static void
insert_entering_non_native(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
                           reg_id_t reg_dc, reg_id_t reg_scratch)
{
    /* C equivalent:
     *   dcontext->thread_record->under_dynamo_control = true
     */
    PRE(ilist, where,
        instr_create_restore_from_dc_via_reg(dcontext, reg_dc, reg_scratch,
                                             THREAD_RECORD_OFFSET));
    PRE(ilist, where,
        XINST_CREATE_store(dcontext,
                           OPND_CREATE_MEM8(reg_scratch,
                                            offsetof(thread_record_t,
                                                     under_dynamo_control)),
                           OPND_CREATE_INT8(true)));

    /* C equivalent:
     *   set_last_exit(dcontext, (linkstub_t *) get_native_exec_linkstub())
     */
    insert_set_last_exit(dcontext,
                         (linkstub_t *) get_native_exec_linkstub(),
                         ilist, where, reg_dc);

    /* C equivalent:
     *   whereami = WHERE_FCACHE
     */
    PRE(ilist, where,
        instr_create_save_immed_to_dc_via_reg(dcontext, reg_dc, WHEREAMI_OFFSET,
                                              (ptr_int_t) WHERE_FCACHE, OPSZ_4));
}

/* Emit code to transfer execution from native module to code cache of non-native
 * module via plt calls.
 * The emitted code update some fields of dcontext like whereami and last_exit,
 * and jump to ibl looking for target code fragment.
 * We assume %XAX holds the target and can be clobbered.
 */
byte *
emit_native_plt_ibl_xfer(dcontext_t *dcontext, byte *pc, generated_code_t *code)
{
    instrlist_t ilist;
    opnd_t tgt = opnd_create_reg(SCRATCH_REG0);

    ASSERT(DYNAMO_OPTION(native_exec_opt));
    instrlist_init(&ilist);
    insert_shared_get_dcontext(dcontext, &ilist, NULL, true);
    insert_entering_non_native(dcontext, &ilist, NULL, REG_NULL, SCRATCH_REG0);
    insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
    return emit_special_ibl_xfer(dcontext, pc, code, NATIVE_PLT_IBL_IDX,
                                 IBL_INDCALL, &ilist, tgt);
}

/* Emit code to transfer execution from native module to code cache of non-native
 * module via return.
 * The emitted code update some fields of dcontext like whereami and last_exit,
 * and jump to ibl looking for target code fragment.
 * We assume %XAX holds the target and must be restored from TLS_SLOT_REG0 before
 * jumpping to ibl.
 */
byte *
emit_native_ret_ibl_xfer(dcontext_t *dcontext, byte *pc, generated_code_t *code)
{
    instrlist_t ilist;
    opnd_t tgt = opnd_create_reg(SCRATCH_REG0);

    ASSERT(DYNAMO_OPTION(native_exec_opt));
    instrlist_init(&ilist);
    insert_shared_get_dcontext(dcontext, &ilist, NULL, true);
    insert_entering_non_native(dcontext, &ilist, NULL, REG_NULL, SCRATCH_REG0);
    insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
    /* restore xax */
    APP(&ilist, instr_create_restore_from_tls(dcontext, SCRATCH_REG0, TLS_SLOT_REG0));
    return emit_special_ibl_xfer(dcontext, pc, code, NATIVE_RET_IBL_IDX,
                                 IBL_RETURN, &ilist, tgt);
}
#endif /* UNIX */
